# Data Cleaning 

#### 1. Import pandas library.

In [1]:
import pandas as pd

#### 2. Import the users table.

In [13]:
# Load the users table from the shared data directory.
users_csv_path = "../data/users.csv"
users = pd.read_csv(users_csv_path)

#### 3. Rename Id column to userId.

In [14]:
# Preview the first five rows to check what was loaded.
users.head(n=5)

Unnamed: 0.1,Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes
0,0,-1,1,0,5007,1920
1,1,2,101,25,3,0
2,2,3,101,22,19,0
3,3,4,101,11,0,0
4,4,5,6792,1145,662,5


In [15]:
# Show the column labels currently on `users` before relabelling them.
users.columns

Index(['Unnamed: 0', 'userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes'], dtype='object')

In [16]:
# Replacement header for `users`: the same six labels, with the user-id
# column spelled USER_ID.
col_names = [
    'Unnamed: 0',
    'USER_ID',
    'Reputation',
    'Views',
    'UpVotes',
    'DownVotes',
]

In [17]:
# Rename only the user-id column, addressed by its current label. Assigning a
# full positional list to `users.columns` silently mislabels data if the CSV's
# column order or count ever changes; a keyed rename leaves every other column
# untouched and is a harmless no-op when this cell is re-run.
users = users.rename(columns={"userId": "USER_ID"})

In [18]:
# Confirm the relabelled header on the first five rows.
users.head(n=5)

Unnamed: 0.1,Unnamed: 0,USER_ID,Reputation,Views,UpVotes,DownVotes
0,0,-1,1,0,5007,1920
1,1,2,101,25,3,0
2,2,3,101,22,19,0
3,3,4,101,11,0,0
4,4,5,6792,1145,662,5


#### 4. Import the posts table. 

In [19]:
# Load the posts table from the shared data directory.
posts_csv_path = "../data/posts.csv"
posts = pd.read_csv(posts_csv_path)

#### 5. Rename Id column to postId and OwnerUserId to userId.

In [20]:
# Show the column labels currently on `posts` before relabelling them.
posts.columns

Index(['Unnamed: 0', 'PostId', 'userId', 'Score', 'ViewCount', 'CommentCount'], dtype='object')

In [21]:
# Replacement header for `posts`: the same six labels, with the id columns
# spelled POST_ID and USER_ID.
col_names_posts = [
    'Unnamed: 0',
    'POST_ID',
    'USER_ID',
    'Score',
    'ViewCount',
    'CommentCount',
]

In [22]:
# Rename only the two id columns, addressed by their current labels. Assigning
# a full positional list to `posts.columns` silently mislabels data if the
# CSV's column order or count ever changes; a keyed rename leaves every other
# column untouched and is a harmless no-op when this cell is re-run.
posts = posts.rename(columns={"PostId": "POST_ID", "userId": "USER_ID"})

In [23]:
# Confirm the relabelled header on the first five rows.
posts.head(n=5)

Unnamed: 0.1,Unnamed: 0,POST_ID,USER_ID,Score,ViewCount,CommentCount
0,0,1,8.0,23,1278.0,1
1,1,2,24.0,22,8198.0,1
2,2,3,18.0,54,3613.0,4
3,3,4,23.0,13,5224.0,2
4,4,5,23.0,81,,3


#### 6. Define new dataframes for users and posts with the following selected columns:
**users_sliced columns**: userId, Reputation, Views, UpVotes, DownVotes  
**posts_sliced columns**: postId, Score, userId, ViewCount, CommentCount

#### 7. Merge the two dataframes created in the step above (6), users_sliced and posts_sliced. 
You will need to perform a [merge](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html) of the posts and users dataframes.

In [None]:
# Keep only the columns requested for each table; .copy() gives the slices
# their own data instead of a view onto the source frames.
user_fields = ["USER_ID", "Reputation", "Views", "UpVotes", "DownVotes"]
post_fields = ["POST_ID", "Score", "USER_ID", "ViewCount", "CommentCount"]
users_sliced = users.loc[:, user_fields].copy()
posts_sliced = posts.loc[:, post_fields].copy()

In [39]:
# Preview the first five rows of the users slice.
users_sliced.head(n=5)

Unnamed: 0,USER_ID,Reputation,Views,UpVotes,DownVotes
0,-1,1,0,5007,1920
1,2,101,25,3,0
2,3,101,22,19,0
3,4,101,11,0,0
4,5,6792,1145,662,5


In [40]:
# Preview the first five rows of the posts slice.
posts_sliced.head(n=5)

Unnamed: 0,POST_ID,Score,USER_ID,ViewCount,CommentCount
0,1,23,8.0,1278.0,1
1,2,22,24.0,8198.0,1
2,3,54,18.0,3613.0,4
3,4,13,23.0,5224.0,2
4,5,81,23.0,,3


In [37]:
# List the distinct merge-key values on the users side, to compare them with
# the posts side before joining.
users_sliced["USER_ID"].unique()

array([   -1,     2,     3, ..., 55745, 55746, 55747])

In [38]:
# List the distinct merge-key values on the posts side, to compare them with
# the users side before joining.
posts_sliced["USER_ID"].unique()

array([8.0000e+00, 2.4000e+01, 1.8000e+01, ..., 3.5801e+04, 4.9365e+04,
       5.5746e+04])

In [42]:
# Order posts by owner id so the extremes of the key column are visible in
# the truncated display.
posts_sliced.sort_values(by="USER_ID")

Unnamed: 0,POST_ID,Score,USER_ID,ViewCount,CommentCount
41252,49878,0,-1.0,,0
36981,44695,0,-1.0,,0
37080,44809,0,-1.0,,0
52904,65171,0,-1.0,,0
52900,65166,0,-1.0,,0
...,...,...,...,...,...
91331,114678,0,,20.0,2
91454,114812,0,,16.0,1
91456,114815,1,,14.0,5
91833,115225,0,,8.0,0


In [35]:
# Inner join on USER_ID: only rows whose key appears in both frames are kept.
# users_sliced is the left frame, so its columns come first in the result.
users_posts = pd.merge(users_sliced, posts_sliced, how="inner", on="USER_ID")
users_posts.head(n=5)

Unnamed: 0,USER_ID,Reputation,Views,UpVotes,DownVotes,POST_ID,Score,ViewCount,CommentCount
0,-1,1,0,5007,1920,2175,0,,0
1,-1,1,0,5007,1920,8576,0,,0
2,-1,1,0,5007,1920,8578,0,,0
3,-1,1,0,5007,1920,8981,0,,0
4,-1,1,0,5007,1920,8982,0,,0


#### 8. How many missing values do you have in your merged dataframe? On which columns?

In [43]:
# Show the column labels of the merged frame.
users_posts.columns

Index(['USER_ID', 'Reputation', 'Views', 'UpVotes', 'DownVotes', 'POST_ID',
       'Score', 'ViewCount', 'CommentCount'],
      dtype='object')

In [44]:
# Labels of the merged frame, in display order, for per-column checks.
colnames = [
    'USER_ID',
    'Reputation',
    'Views',
    'UpVotes',
    'DownVotes',
    'POST_ID',
    'Score',
    'ViewCount',
    'CommentCount',
]

In [47]:
# Count missing values per column directly from the frame. This covers every
# column that actually exists (no hand-copied name list that can drift out of
# date) and displays the counts as a labelled Series instead of concatenated
# strings.
users_posts.isna().sum()

USER_ID 0
Reputation 0
Views 0
UpVotes 0
DownVotes 0
POST_ID 0
Score 0
ViewCount 48396
CommentCount 0


In [49]:
# Inspect the distinct ViewCount values, the one column reported with
# missing entries above.
users_posts["ViewCount"].unique()

array([       nan, 2.9229e+04, 1.9900e+03, ..., 3.7630e+03, 3.3570e+03,
       1.0000e+00])

In [51]:
# Frequency of each ViewCount value, most common first.
users_posts["ViewCount"].value_counts()

38.0       295
31.0       293
37.0       277
27.0       277
24.0       274
          ... 
25962.0      1
2586.0       1
2067.0       1
4443.0       1
3940.0       1
Name: ViewCount, Length: 3654, dtype: int64

In [50]:
# Total number of rows in the ViewCount column (missing entries included),
# for comparison with the missing-value count.
users_posts["ViewCount"].size

90584

#### 9. You will need to do something about the missing values. Will you drop them or fill them? Explain. 
**Remember** to check the results of your code before going to the next step.

In [65]:
# ViewCount is the only column with missing values (48396 of 90584 rows), far
# too many rows to drop, so the gaps are filled with 0.
# NOTE(review): 0 conflates "no view count recorded" with "zero views" —
# confirm that is acceptable for the downstream analysis.
# The fill is limited to ViewCount and returns a new frame rather than
# mutating in place, so a future NaN in any other column is not silently
# turned into 0.
users_posts = users_posts.fillna({"ViewCount": 0})

In [66]:
users_posts.head()

Unnamed: 0,USER_ID,Reputation,Views,UpVotes,DownVotes,POST_ID,Score,ViewCount,CommentCount
0,-1,1,0,5007,1920,2175,0,0.0,0
1,-1,1,0,5007,1920,8576,0,0.0,0
2,-1,1,0,5007,1920,8578,0,0.0,0
3,-1,1,0,5007,1920,8981,0,0.0,0
4,-1,1,0,5007,1920,8982,0,0.0,0


#### 10. Adjust the data types in order to avoid future issues. Which ones should be changed? 

In [None]:
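# After filling the missing values with zeros no column contains NaN, so no dtype changes were applied.
# NOTE(review): ViewCount still displays as a float (0.0) - consider users_posts.astype({"ViewCount": "int64"}) if integer counts are required.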