#### 1. Import pandas library

In [50]:
import pandas as pd

#### 2. Import users table:

In [51]:
df_users = pd.read_csv('users_table.csv')
df_users.columns

Index(['Id', 'Reputation', 'CreationDate', 'DisplayName', 'LastAccessDate',
       'WebsiteUrl', 'Location', 'AboutMe', 'Views', 'UpVotes', 'DownVotes',
       'AccountId', 'Age', 'ProfileImageUrl'],
      dtype='object')

#### 3. Rename Id column to userId

In [52]:
df_users.rename(columns= {'Id': 'userId'}, inplace= True)
df_users.columns

Index(['userId', 'Reputation', 'CreationDate', 'DisplayName', 'LastAccessDate',
       'WebsiteUrl', 'Location', 'AboutMe', 'Views', 'UpVotes', 'DownVotes',
       'AccountId', 'Age', 'ProfileImageUrl'],
      dtype='object')

#### 4. Import posts table:

In [53]:
df_posts = pd.read_csv('posts_table.csv')
df_posts.columns

Index(['Id', 'PostTypeId', 'AcceptedAnswerId', 'CreaionDate', 'Score',
       'ViewCount', 'Body', 'OwnerUserId', 'LasActivityDate', 'Title', 'Tags',
       'AnswerCount', 'CommentCount', 'FavoriteCount', 'LastEditorUserId',
       'LastEditDate', 'CommunityOwnedDate', 'ParentId', 'ClosedDate',
       'OwnerDisplayName', 'LastEditorDisplayName'],
      dtype='object')

#### 5. Rename Id column to postId and OwnerUserId to userId

In [54]:
df_posts.rename(columns= {'Id': 'postId', 'OwnerUserId': 'userId'}, inplace= True)
df_posts.columns

Index(['postId', 'PostTypeId', 'AcceptedAnswerId', 'CreaionDate', 'Score',
       'ViewCount', 'Body', 'userId', 'LasActivityDate', 'Title', 'Tags',
       'AnswerCount', 'CommentCount', 'FavoriteCount', 'LastEditorUserId',
       'LastEditDate', 'CommunityOwnedDate', 'ParentId', 'ClosedDate',
       'OwnerDisplayName', 'LastEditorDisplayName'],
      dtype='object')

#### 6. Define new dataframes for users and posts with the following selected columns:
    **users columns**: userId, Reputation,Views,UpVotes,DownVotes
    **posts columns**: postId, Score,userId,ViewCount,CommentCount

In [55]:
new_df_users = df_users['userId, Reputation, Views, UpVotes, DownVotes'.split(', ')]
print(new_df_users.columns)
print('\n')
new_df_posts = df_posts['postId, Score, userId, ViewCount, CommentCount'.split(', ')]
print(new_df_posts.columns)

Index(['userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes'], dtype='object')


Index(['postId', 'Score', 'userId', 'ViewCount', 'CommentCount'], dtype='object')


#### 7. Merge both dataframes, users and posts. 
You will need to make a [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes.

In [56]:
new_df = new_df_users.merge(new_df_posts, on= 'userId')
new_df

Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes,postId,Score,ViewCount,CommentCount
0,-1,1,0,5007,1920,2175,0,,0
1,-1,1,0,5007,1920,8576,0,,0
2,-1,1,0,5007,1920,8578,0,,0
3,-1,1,0,5007,1920,8981,0,,0
4,-1,1,0,5007,1920,8982,0,,0
...,...,...,...,...,...,...,...,...,...
38957,45934,11,1,0,0,34003,1,115.0,2
38958,46192,36,1,0,0,40667,5,326.0,2
38959,46522,235,13,27,1,17461,3,166.0,0
38960,52371,221,2,0,0,27237,24,3357.0,5


#### 8. How many missing values do you have in your merged dataframe? On which columns?

In [57]:
null_cols = new_df.isnull().sum()
print(f'There are {null_cols[null_cols > 0][0]} missing values in {null_cols[null_cols > 0].index[0]} column.')

There are 23572 missing values in ViewCount column.


#### 9. You will need to make something with missing values.  Will you clean or filling them? Explain. 
**Remember** to check the results of your code before passing to the next step

In [58]:
print(f"In column {null_cols[null_cols > 0].index[0]}, {round(null_cols[null_cols > 0][0]/len(new_df)*100,2)}% are null \
values. Let's drop all rows where the column {null_cols[null_cols > 0].index[0]} has null values.")

In column ViewCount, 60.5% are null values. Let's drop all rows where the column ViewCount has null values.


In [59]:
new_df.dropna(subset= ['ViewCount'], inplace= True)

In [60]:
null_cols = new_df.isnull().sum().sum()
print(f'There are {null_cols} missing values in modified DataFrame.')

There are 0 missing values in modified DataFrame.


#### 10. Adjust the data types in order to avoid future issues. Which ones should be changed? 

In [61]:
# Let's print the data types of merged dataframe "new_df#
new_df.dtypes

userId            int64
Reputation        int64
Views             int64
UpVotes           int64
DownVotes         int64
postId            int64
Score             int64
ViewCount       float64
CommentCount      int64
dtype: object

In [62]:
# The column ViewCount has the counts of the views of a post. Let's change its type to int64. There is not half a view.
new_df['ViewCount'] = new_df['ViewCount'].astype('int64')
new_df.dtypes

userId          int64
Reputation      int64
Views           int64
UpVotes         int64
DownVotes       int64
postId          int64
Score           int64
ViewCount       int64
CommentCount    int64
dtype: object

In [63]:
# Let's check the content of new_df now
new_df

Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes,postId,Score,ViewCount,CommentCount
123,5,6792,1145,662,5,6,152,29229,5
131,5,6792,1145,662,5,103,28,1990,6
133,5,6792,1145,662,5,125,75,29261,2
145,5,6792,1145,662,5,423,156,64481,7
150,5,6792,1145,662,5,562,10,1005,1
...,...,...,...,...,...,...,...,...,...
38956,44995,11,9,0,0,44474,1,446,1
38957,45934,11,1,0,0,34003,1,115,2
38958,46192,36,1,0,0,40667,5,326,2
38959,46522,235,13,27,1,17461,3,166,0


In [64]:
# Let's check the descritive statistics of each column of new_df, adding also interquartile range
stats = new_df.describe().transpose()
stats['IQR'] = stats['75%'] - stats['25%']
stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,IQR
userId,15390.0,7863.802079,5474.997853,5.0,3305.0,7284.5,11889.0,52371.0,8584.0
Reputation,15390.0,866.745874,3160.304354,1.0,36.0,148.0,477.0,87393.0,441.0
Views,15390.0,152.623522,644.478541,0.0,4.0,18.0,73.0,20932.0,69.0
UpVotes,15390.0,105.275179,507.47492,0.0,0.0,5.0,37.0,11442.0,37.0
DownVotes,15390.0,3.204029,21.851984,0.0,0.0,0.0,0.0,779.0,0.0
postId,15390.0,24849.074854,13445.16743,1.0,13639.25,24665.0,35355.25,48325.0,21716.0
Score,15390.0,4.150032,6.839064,-13.0,1.0,3.0,5.0,192.0,4.0
ViewCount,15390.0,1196.225991,3742.952815,14.0,173.0,385.0,967.0,175495.0,794.0
CommentCount,15390.0,2.322287,2.764284,0.0,0.0,2.0,4.0,29.0,4.0


In [65]:
# Let's drop the rows userId and postId, because they are just ID numbers and does not make sense to consider them 
# relatively to outliers
stats.drop(index= ['userId', 'postId'], inplace= True)

In [66]:
# Let's compile each row of new_df where there are outlier and indicate what is the name of column where it is 
outliers = pd.DataFrame(columns=stats.index)
 
for col in stats.index:
    iqr = stats.at[col,'IQR']
    cutoff = iqr * 1.5
    lower = stats.at[col,'25%'] - cutoff
    upper = stats.at[col,'75%'] + cutoff
    results = new_df[(new_df[col] < lower) | 
                   (new_df[col] > upper)].copy()
    results['Outlier'] = col
    outliers = outliers.append(results)

In [69]:
# Let's check the content of this dataframe
outliers

Unnamed: 0,Reputation,Views,UpVotes,DownVotes,Score,ViewCount,CommentCount,userId,postId,Outlier
123,6792,1145,662,5,152,29229,5,5.0,6.0,Reputation
131,6792,1145,662,5,28,1990,6,5.0,103.0,Reputation
133,6792,1145,662,5,75,29261,2,5.0,125.0,Reputation
145,6792,1145,662,5,156,64481,7,5.0,423.0,Reputation
150,6792,1145,662,5,10,1005,1,5.0,562.0,Reputation
...,...,...,...,...,...,...,...,...,...,...
38593,30,8,3,0,0,265,11,18482.0,47284.0,CommentCount
38596,178,11,2,0,2,537,14,18492.0,47299.0,CommentCount
38621,70,9,1,0,0,87,11,18531.0,47750.0,CommentCount
38625,31,32,0,0,0,628,11,18546.0,47444.0,CommentCount


In [71]:
# Let's export it to a csv file
outliers.to_csv('outliers.csv')