# Data Cleaning 

#### 1. Import pandas library.

In [1]:
import pandas as pd

In [23]:
import numpy as np

#### 2. Import the users table.

In [2]:
users = pd.read_csv('../data/users.csv')
users

Unnamed: 0.1,Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes
0,0,-1,1,0,5007,1920
1,1,2,101,25,3,0
2,2,3,101,22,19,0
3,3,4,101,11,0,0
4,4,5,6792,1145,662,5
...,...,...,...,...,...,...
40320,40320,55743,1,0,0,0
40321,40321,55744,6,1,0,0
40322,40322,55745,101,0,0,0
40323,40323,55746,106,1,0,0


#### 3. Rename Id column to userId.

In [3]:
users_cols = ['Unnamed: 0','Id','Reputation','Views','UpVotes','DownVotes']

In [4]:
users.columns = users_cols

In [5]:
users

Unnamed: 0.1,Unnamed: 0,Id,Reputation,Views,UpVotes,DownVotes
0,0,-1,1,0,5007,1920
1,1,2,101,25,3,0
2,2,3,101,22,19,0
3,3,4,101,11,0,0
4,4,5,6792,1145,662,5
...,...,...,...,...,...,...
40320,40320,55743,1,0,0,0
40321,40321,55744,6,1,0,0
40322,40322,55745,101,0,0,0
40323,40323,55746,106,1,0,0


#### 4. Import the posts table. 

In [6]:
posts = pd.read_csv('../data/posts.csv')

#### 5. Rename Id column to postId and OwnerUserId to userId.

In [7]:
posts_cols = ['Unnamed: 0','Id','OwnerUserId','Score','ViewCount','CommentCount']

In [8]:
posts.columns = posts_cols

In [9]:
posts

Unnamed: 0.1,Unnamed: 0,Id,OwnerUserId,Score,ViewCount,CommentCount
0,0,1,8.0,23,1278.0,1
1,1,2,24.0,22,8198.0,1
2,2,3,18.0,54,3613.0,4
3,3,4,23.0,13,5224.0,2
4,4,5,23.0,81,,3
...,...,...,...,...,...,...
91971,91971,115374,805.0,2,,2
91972,91972,115375,49365.0,0,9.0,0
91973,91973,115376,55746.0,1,5.0,2
91974,91974,115377,805.0,0,,0


#### 6. Define new dataframes for users and posts with the following selected columns:
**users_sliced columns**: userId, Reputation, Views, UpVotes, DownVotes  
**posts_sliced columns**: postId, Score, userId, ViewCount, CommentCount

In [10]:
users.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [11]:
users

Unnamed: 0,Id,Reputation,Views,UpVotes,DownVotes
0,-1,1,0,5007,1920
1,2,101,25,3,0
2,3,101,22,19,0
3,4,101,11,0,0
4,5,6792,1145,662,5
...,...,...,...,...,...
40320,55743,1,0,0,0
40321,55744,6,1,0,0
40322,55745,101,0,0,0
40323,55746,106,1,0,0


In [12]:
posts.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [13]:
posts

Unnamed: 0,Id,OwnerUserId,Score,ViewCount,CommentCount
0,1,8.0,23,1278.0,1
1,2,24.0,22,8198.0,1
2,3,18.0,54,3613.0,4
3,4,23.0,13,5224.0,2
4,5,23.0,81,,3
...,...,...,...,...,...
91971,115374,805.0,2,,2
91972,115375,49365.0,0,9.0,0
91973,115376,55746.0,1,5.0,2
91974,115377,805.0,0,,0


In [14]:
new_posts = ['Id', 'Score', 'OwnerUserId', 'ViewCount', 'CommentCount']

In [15]:
posts = posts.reindex(columns = new_posts)

In [16]:
posts

Unnamed: 0,Id,Score,OwnerUserId,ViewCount,CommentCount
0,1,23,8.0,1278.0,1
1,2,22,24.0,8198.0,1
2,3,54,18.0,3613.0,4
3,4,13,23.0,5224.0,2
4,5,81,23.0,,3
...,...,...,...,...,...
91971,115374,2,805.0,,2
91972,115375,0,49365.0,9.0,0
91973,115376,1,55746.0,5.0,2
91974,115377,0,805.0,,0


#### 7. Merge the two dataframes created in the step above (8), users_sliced and posts_sliced. 
You will need to make a [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes.

In [17]:
dataframe = users.merge(posts, on=['Id'])

In [18]:
dataframe

Unnamed: 0,Id,Reputation,Views,UpVotes,DownVotes,Score,OwnerUserId,ViewCount,CommentCount
0,2,101,25,3,0,22,24.0,8198.0,1
1,3,101,22,19,0,54,18.0,3613.0,4
2,4,101,11,0,0,13,23.0,5224.0,2
3,5,6792,1145,662,5,81,23.0,,3
4,6,457,114,47,0,152,5.0,29229.0,5
...,...,...,...,...,...,...,...,...,...
32052,55743,1,0,0,0,0,24164.0,,1
32053,55744,6,1,0,0,1,10961.0,,7
32054,55745,101,0,0,0,3,4598.0,,2
32055,55746,106,1,0,0,3,20315.0,59.0,0


#### 8. How many missing values do you have in your merged dataframe? On which columns?

In [25]:
missingdata = dataframe.isnull().sum()

In [26]:
missingdata

Id                0
Reputation        0
Views             0
UpVotes           0
DownVotes         0
Score             0
OwnerUserId     772
ViewCount         0
CommentCount      0
dtype: int64

#### 9. You will need to make something with missing values.  Will you clean or filling them? Explain. 
**Remember** to check the results of your code before going to the next step.

In [24]:
#I would drop rows with  missing Id number because it's not useful data, and fill the missing ViewCount values by 0.
dataframe['ViewCount'] = dataframe['ViewCount'].replace(np.nan, 0)
dataframe

Unnamed: 0,Id,Reputation,Views,UpVotes,DownVotes,Score,OwnerUserId,ViewCount,CommentCount
0,2,101,25,3,0,22,24.0,8198.0,1
1,3,101,22,19,0,54,18.0,3613.0,4
2,4,101,11,0,0,13,23.0,5224.0,2
3,5,6792,1145,662,5,81,23.0,0.0,3
4,6,457,114,47,0,152,5.0,29229.0,5
...,...,...,...,...,...,...,...,...,...
32052,55743,1,0,0,0,0,24164.0,0.0,1
32053,55744,6,1,0,0,1,10961.0,0.0,7
32054,55745,101,0,0,0,3,4598.0,0.0,2
32055,55746,106,1,0,0,3,20315.0,59.0,0


#### 10. Adjust the data types in order to avoid future issues. Which ones should be changed? 