#### 1. Import pandas library

In [2]:
import pandas as pd
import numpy as np

#### 2. Import users table:

In [3]:
# Load the raw users table from the CSV file in the working directory.
tb_users = pd.read_csv('users_table.csv')

#### 3. Rename Id column to userId

In [4]:
# Rename the users key column from `Id` to `userId`.
# Rebinding the name (instead of mutating in place) keeps the cell chain-friendly.
tb_users = tb_users.rename(columns={'Id': 'userId'})

#### 4. Import posts table:

In [5]:
# Load the raw posts table from the CSV file in the working directory.
tb_posts = pd.read_csv('posts_table.csv')

#### 5. Rename Id column to postId and OwnerUserId to userId

In [6]:
# Rename the post key to `postId` and the owner key to `userId`.
tb_posts = tb_posts.rename(
    columns={
        'Id': 'postId',
        'OwnerUserId': 'userId',
    }
)

#### 6. Define new dataframes for users and posts with the following selected columns:
- **users columns**: userId, Reputation, Views, UpVotes, DownVotes
- **posts columns**: postId, Score, userId, ViewCount, CommentCount

In [7]:
# Keep only the columns needed for the analysis, in the requested order.
users_columns = ['userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes']
posts_columns = ['postId', 'Score', 'userId', 'ViewCount', 'CommentCount']

new_users = tb_users.loc[:, users_columns]
new_posts = tb_posts.loc[:, posts_columns]

#### 7. Merge both dataframes, users and posts. 
You will need to perform a [merge](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html) of the posts and users dataframes.

In [8]:
# Join users (left) with posts (right) on the shared `userId` column.
# `merge` defaults to an inner join, so only userIds present in both frames remain.
new_df = new_users.merge(new_posts, on='userId')

#### 8. How many missing values do you have in your merged dataframe? On which columns?

In [9]:
# Count the missing values in each column of the merged frame.
# `isnull` is an alias of `isna`, so one call is enough; the cell displays its result.
new_df.isna().sum()

userId              0
Reputation          0
Views               0
UpVotes             0
DownVotes           0
postId              0
Score               0
ViewCount       23572
CommentCount        0
dtype: int64

#### 9. You will need to do something about the missing values. Will you drop them or fill them? Explain.
**Remember** to check the results of your code before moving on to the next step.

In [10]:
# Decision: drop the ViewCount column rather than drop rows or fill values.
# The merged data has 38962 rows and ViewCount is missing in 23572 of them
# (about 60.5%), so the column carries too little information to keep.
#
# The frame is rebound instead of mutated in place, and `errors='ignore'` makes
# the cell safe to re-run after the column has already been removed.
new_df = new_df.drop(columns='ViewCount', errors='ignore')

# Show the resulting shape so the drop can be checked before moving on.
new_df.shape

#### 10. Adjust the data types in order to avoid future issues. Which ones should be changed? 

In [11]:
"""Low Variance"""
# A numeric column is treated as low-variance when its 90th percentile equals
# its minimum, i.e. roughly 90% of its values sit at the minimum.
# `select_dtypes` is the public way to pick the numeric columns
# (`_get_numeric_data` is a private pandas method).
low_variance = [
    col
    for col in new_df.select_dtypes(include='number').columns
    if np.percentile(new_df[col], 90) == new_df[col].min()
]

print(low_variance)  # an empty list means no column has low variance

# The list is empty, so we can conclude that no column has low variance.

[]


In [21]:
"""Extreme Values and Outliers"""
# Summary statistics with one row per column, extended with the
# interquartile range (third quartile minus first quartile).
stats = new_df.describe().T
stats = stats.assign(IQR=stats['75%'] - stats['25%'])
stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,IQR
userId,38962.0,6079.063087,5224.896435,-1.0,1317.0,4856.0,9651.0,55226.0,8334.0
Reputation,38962.0,7281.091679,15164.527714,1.0,147.0,909.0,7931.0,87393.0,7784.0
Views,38962.0,1400.648016,3423.886887,0.0,16.0,124.0,1050.0,20932.0,1034.0
UpVotes,38962.0,914.799677,2296.52706,0.0,4.0,65.0,582.0,11442.0,578.0
DownVotes,38962.0,43.84105,161.797079,0.0,0.0,1.0,16.0,1920.0,16.0
postId,38962.0,22960.799651,13696.932471,1.0,11325.25,22373.5,33688.5,48325.0,22363.25
Score,38962.0,4.083081,6.561843,-19.0,1.0,2.0,5.0,192.0,4.0
CommentCount,38962.0,2.01463,2.674018,0.0,0.0,1.0,3.0,45.0,3.0


In [22]:
# Flag potential outliers column by column: a row is flagged for a column when
# its value lies more than 1.5 * IQR below the first quartile or above the
# third quartile. A row flagged for several columns appears once per column,
# with the triggering column recorded in `Outlier`.
flagged = []

# `stats.index[1:]` skips the first row of `stats`
# (userId in the run recorded in this notebook).
for col in stats.index[1:]:
    iqr = stats.at[col, 'IQR']
    cutoff = iqr * 1.5
    lower = stats.at[col, '25%'] - cutoff
    upper = stats.at[col, '75%'] + cutoff
    results = new_df[(new_df[col] < lower) | (new_df[col] > upper)].copy()
    results['Outlier'] = col
    flagged.append(results)

# `DataFrame.append` was removed in pandas 2.0, so collect the per-column
# frames in a list and concatenate once. `pd.concat` raises on an empty list,
# so fall back to an empty frame with the expected columns in that case.
if flagged:
    outliers = pd.concat(flagged)
else:
    outliers = pd.DataFrame(columns=[*new_df.columns, 'Outlier'])

outliers

# The recorded run flagged possible outliers in 27009 rows across different
# columns, so we have to be careful with them.

Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes,postId,Score,CommentCount,Outlier
1808,183,22625,4069,2496,45,201,8,0,Reputation
1809,183,22625,4069,2496,45,202,11,0,Reputation
1810,183,22625,4069,2496,45,203,16,2,Reputation
1811,183,22625,4069,2496,45,204,7,2,Reputation
1812,183,22625,4069,2496,45,210,9,3,Reputation
...,...,...,...,...,...,...,...,...,...
38621,18531,70,9,1,0,47750,0,11,CommentCount
38625,18546,31,32,0,0,47444,0,11,CommentCount
38770,19788,180,14,13,0,47989,3,10,CommentCount
38861,21466,565,19,0,0,7224,89,10,CommentCount


In [130]:
"""Data Type Correction"""
# Display the dtype of every column in the merged frame.
new_df.dtypes 

# Every remaining column is already int64, so no type conversion is needed.

userId          int64
Reputation      int64
Views           int64
UpVotes         int64
DownVotes       int64
postId          int64
Score           int64
CommentCount    int64
dtype: object

In [127]:
"""Cleaning Text and Removing Special Characters"""
# All columns in this dataset are integers, so there is no text to clean. Only the column names needed checking; we did that earlier and they are fine.

'Cleaning Text and Removing Special Characters'

In [129]:
"""Finding and Removing Duplicates"""
# Count the rows, drop exact duplicate rows, then report how many were removed.
before = new_df.shape[0]
new_df = new_df.drop_duplicates()
after = new_df.shape[0]

dropped = before - after
print('Number of duplicate records dropped: ', str(dropped))

Number of duplicate records dropped:  0
