#### 1. Import pandas library

In [1]:
import pandas as pd

#### 2. Load data (users.csv, posts.csv)

In [2]:
# Read the users table from the CSV file in the notebook's working directory.
users_csv = './users.csv'
users = pd.read_csv(users_csv)

In [3]:
# Read the posts table from the CSV file in the notebook's working directory.
posts_csv = './posts.csv'
posts = pd.read_csv(posts_csv)

#### 3. Rename id column to user_id

In [4]:
# Rename the `id` column to `user_id`.
# The renamed frame is rebound to `users` rather than mutated with
# inplace=True, so the statement stays chainable and no frame is modified
# behind the reader's back.
users = users.rename(columns={'id': 'user_id'})

#### 4. Rename id column to post_id and owner_user_id to user_id

In [5]:
# Rename `id` to `post_id` and `owner_user_id` to `user_id`.
# The renamed frame is rebound to `posts` rather than mutated with
# inplace=True, so the statement stays chainable and no frame is modified
# behind the reader's back.
posts = posts.rename(columns={'id': 'post_id', 'owner_user_id': 'user_id'})

#### 5. Define new dataframes for users and posts with the following selected columns:
- **users columns**: user_id, reputation, views, up_votes, down_votes
- **posts columns**: post_id, score, user_id, view_count, comment_count

In [6]:
# Keep only the user columns requested for the analysis, then preview them.
user_columns = ['user_id', 'reputation', 'views', 'up_votes', 'down_votes']
new_users = users.loc[:, user_columns]
new_users.head()

Unnamed: 0,user_id,reputation,views,up_votes,down_votes
0,107658,305,91,2,0
1,218597,6559,374,361,25
2,326360,1802,97,40,14
3,379556,2335,261,125,10
4,450456,7746,3598,256,50


In [7]:
# Keep only the post columns requested for the analysis, then preview them.
post_columns = ['post_id', 'score', 'user_id', 'view_count', 'comment_count']
new_posts = posts.loc[:, post_columns]
new_posts.head()

Unnamed: 0,post_id,score,user_id,view_count,comment_count
0,30336926,1,,14,3
1,36873524,1,,18,0
2,36605876,1,,19,0
3,36718461,1,,17,0
4,30434893,0,,15,0


#### 6. Merge both dataframes, users and posts. 
You will need to make a [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes.

In [8]:
# Join each user to their posts on the shared `user_id` column.
# The key and the join type are spelled out instead of relying on merge's
# defaults (every column name common to both frames, inner join), so a column
# that later appears in both frames cannot silently change the join key.
# An inner join keeps only the user_id values present in both frames.
merged_df = new_users.merge(new_posts, on='user_id', how='inner')

In [9]:
# Preview the first five rows of the merged frame.
merged_df.head(n=5)

Unnamed: 0,user_id,reputation,views,up_votes,down_votes,post_id,score,view_count,comment_count
0,450456,7746,3598,256,50,34989087,0,19,0
1,450456,7746,3598,256,50,4142174,0,15,0
2,1517244,4172,224,1362,32,37211628,2,18,0
3,1870509,897,105,86,2,30260248,1,11,4
4,101719,4055,332,174,16,34528617,0,18,0


#### 7. How many missing values do you have in your merged dataframe? 

In [10]:
# Count the missing entries in each column of the merged frame.
merged_df.isna().sum()

user_id          0
reputation       0
views            0
up_votes         0
down_votes       0
post_id          0
score            0
view_count       0
comment_count    0
dtype: int64

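There are no missing values in the merged dataframe: every column has a null count of 0. Note that `merge` performs an inner join by default, so posts whose `user_id` is missing (as in the `new_posts` preview above) or has no match in `new_users` were dropped by the merge instead of being kept as rows with NaN values.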

#### Bonus: Identify extreme values in your merged dataframe: create a dataframe called outliers with the same columns as our data set and calculate the bounds. The values of the outliers dataframe will be the values of merged_df that fall outside those bounds. You will need to save your outliers dataframe to a csv file in the your-code folder. Hint: post_id cannot have outliers!

An outlier is a point which falls more than 1.5 times the interquartile range above the third quartile or below the first quartile.

##### IQR (interquartile range)
- IQR = 3rd quartile - 1st quartile
- Lower bound: 1st quartile - (1.5 * IQR)
- Upper bound: 3rd quartile + (1.5 * IQR)

In [11]:
# Summary statistics for every column except the two identifiers, laid out
# with one row per merged_df column and one column per statistic.
id_columns = ['user_id', 'post_id']
stats = merged_df.drop(columns=id_columns).describe().T
stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
reputation,1065.0,7251.176526,24021.612286,1.0,246.0,1145.0,5206.0,287170.0
views,1065.0,615.24507,1497.407529,0.0,39.0,145.0,496.0,23103.0
up_votes,1065.0,397.688263,747.628862,0.0,15.0,96.0,375.0,7886.0
down_votes,1065.0,49.415023,177.308773,0.0,0.0,3.0,20.0,2692.0
score,1065.0,0.123944,0.578141,-7.0,0.0,0.0,0.0,3.0
view_count,1065.0,15.606573,3.711159,3.0,13.0,17.0,19.0,20.0
comment_count,1065.0,0.926761,1.627503,0.0,0.0,0.0,1.0,11.0


In [12]:
# Add the interquartile range of each variable (75th percentile minus 25th
# percentile) as an extra column of the statistics table.
first_quartile = stats['25%']
third_quartile = stats['75%']
stats['IQR'] = third_quartile - first_quartile
stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,IQR
reputation,1065.0,7251.176526,24021.612286,1.0,246.0,1145.0,5206.0,287170.0,4960.0
views,1065.0,615.24507,1497.407529,0.0,39.0,145.0,496.0,23103.0,457.0
up_votes,1065.0,397.688263,747.628862,0.0,15.0,96.0,375.0,7886.0,360.0
down_votes,1065.0,49.415023,177.308773,0.0,0.0,3.0,20.0,2692.0,20.0
score,1065.0,0.123944,0.578141,-7.0,0.0,0.0,0.0,3.0,0.0
view_count,1065.0,15.606573,3.711159,3.0,13.0,17.0,19.0,20.0,6.0
comment_count,1065.0,0.926761,1.627503,0.0,0.0,0.0,1.0,11.0,1.0


In [13]:
# Flip the table back so each analysed variable is a column again, leaving out
# the IQR entry so only the describe() statistics remain as rows.
# NOTE: at this point `outliers` holds summary statistics, not outlier rows.
outliers = stats.drop(columns='IQR').T
outliers

Unnamed: 0,reputation,views,up_votes,down_votes,score,view_count,comment_count
count,1065.0,1065.0,1065.0,1065.0,1065.0,1065.0,1065.0
mean,7251.176526,615.24507,397.688263,49.415023,0.123944,15.606573,0.926761
std,24021.612286,1497.407529,747.628862,177.308773,0.578141,3.711159,1.627503
min,1.0,0.0,0.0,0.0,-7.0,3.0,0.0
25%,246.0,39.0,15.0,0.0,0.0,13.0,0.0
50%,1145.0,145.0,96.0,3.0,0.0,17.0,0.0
75%,5206.0,496.0,375.0,20.0,0.0,19.0,1.0
max,287170.0,23103.0,7886.0,2692.0,3.0,20.0,11.0


In [14]:
# Bonus: build the `outliers` dataframe from merged_df.
# A value is an outlier when it lies below Q1 - 1.5 * IQR or above
# Q3 + 1.5 * IQR of its column. The quartiles and the IQR are read from
# `stats`, whose index lists the analysed columns; user_id and post_id were
# removed from it, so the identifier columns are never tested.
IQR_FACTOR = 1.5
lower_bound = stats['25%'] - IQR_FACTOR * stats['IQR']
upper_bound = stats['75%'] + IQR_FACTOR * stats['IQR']

# Compare every analysed column with its own bounds. The bound Series are
# aligned to the column labels, so no explicit loop over columns is needed.
analysed = merged_df[stats.index]
is_outlier = (
    analysed.lt(lower_bound, axis='columns')
    | analysed.gt(upper_bound, axis='columns')
)

# Keep the complete merged_df rows (same columns as the data set) that contain
# at least one out-of-bounds value.
outliers = merged_df[is_outlier.any(axis='columns')]

# Written next to the notebook; assumes the notebook runs from the your-code
# folder -- adjust the path if it does not.
outliers.to_csv('./outliers.csv', index=False)
outliers.head()