# Data Cleaning 

#### 1. Import pandas library.

In [5]:
import numpy as np
import pandas as pd

#### 2. Import the users table.

In [7]:
users = pd.read_csv("../data/users.csv")

In [8]:
users.head()

Unnamed: 0.1,Unnamed: 0,Id,Reputation,Views,UpVotes,DownVotes
0,0,-1,1,0,5007,1920
1,1,2,101,25,3,0
2,2,3,101,22,19,0
3,3,4,101,11,0,0
4,4,5,6792,1145,662,5


#### 3. Rename Id column to userId.

In [9]:
users = users.rename(columns ={"Id":"userId"})

In [10]:
users.head(15)

Unnamed: 0.1,Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes
0,0,-1,1,0,5007,1920
1,1,2,101,25,3,0
2,2,3,101,22,19,0
3,3,4,101,11,0,0
4,4,5,6792,1145,662,5
5,5,6,457,114,47,0
6,6,7,429,56,20,0
7,7,8,6764,1089,604,25
8,8,10,121,20,2,0
9,9,11,136,10,10,0


#### 4. Import the posts table. 

In [11]:
posts = pd.read_csv("../data/posts.csv")

In [51]:
posts.head(15)

Unnamed: 0.1,Unnamed: 0,postId,userId,Score,ViewCount,CommentCount
0,0,1,8.0,23,1278.0,1
1,1,2,24.0,22,8198.0,1
2,2,3,18.0,54,3613.0,4
3,3,4,23.0,13,5224.0,2
4,4,5,23.0,81,,3
5,5,6,5.0,152,29229.0,5
6,6,7,38.0,76,5808.0,3
7,7,8,37.0,0,288.0,2
8,8,9,50.0,13,,3
9,9,10,24.0,23,21925.0,4


In [53]:
posts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91976 entries, 0 to 91975
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    91976 non-null  int64  
 1   postId        91976 non-null  int64  
 2   userId        90584 non-null  float64
 3   Score         91976 non-null  int64  
 4   ViewCount     42921 non-null  float64
 5   CommentCount  91976 non-null  int64  
dtypes: float64(2), int64(4)
memory usage: 4.2 MB


In [None]:
#user id has some missing values

#### 5. Rename Id column to postId and OwnerUserId to userId.

In [13]:
posts = posts.rename(columns = {"Id":"postId","OwnerUserId":"userId"})

In [14]:
posts.head()

Unnamed: 0.1,Unnamed: 0,postId,userId,Score,ViewCount,CommentCount
0,0,1,8.0,23,1278.0,1
1,1,2,24.0,22,8198.0,1
2,2,3,18.0,54,3613.0,4
3,3,4,23.0,13,5224.0,2
4,4,5,23.0,81,,3


#### 6. Define new dataframes for users and posts with the following selected columns:
**users_sliced columns**: userId, Reputation, Views, UpVotes, DownVotes  
**posts_sliced columns**: postId, Score, userId, ViewCount, CommentCount

In [15]:
users_sliced= users.loc[:,["userId", "Reputation", "Views", "UpVotes", "DownVotes"]]
users_sliced

Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes
0,-1,1,0,5007,1920
1,2,101,25,3,0
2,3,101,22,19,0
3,4,101,11,0,0
4,5,6792,1145,662,5
...,...,...,...,...,...
40320,55743,1,0,0,0
40321,55744,6,1,0,0
40322,55745,101,0,0,0
40323,55746,106,1,0,0


In [17]:
posts_sliced =posts.loc[:,["postId", "Score", "userId", "ViewCount", "CommentCount"]]
posts_sliced

Unnamed: 0,postId,Score,userId,ViewCount,CommentCount
0,1,23,8.0,1278.0,1
1,2,22,24.0,8198.0,1
2,3,54,18.0,3613.0,4
3,4,13,23.0,5224.0,2
4,5,81,23.0,,3
...,...,...,...,...,...
91971,115374,2,805.0,,2
91972,115375,0,49365.0,9.0,0
91973,115376,1,55746.0,5.0,2
91974,115377,0,805.0,,0


#### 7. Merge the two dataframes created in the step above (8), users_sliced and posts_sliced. 
You will need to make a [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes.

In [45]:
posts_users = posts_sliced.merge(right =users_sliced,)

In [46]:
posts_users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90584 entries, 0 to 90583
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   postId        90584 non-null  int64  
 1   Score         90584 non-null  int64  
 2   userId        90584 non-null  float64
 3   ViewCount     42188 non-null  float64
 4   CommentCount  90584 non-null  int64  
 5   Reputation    90584 non-null  int64  
 6   Views         90584 non-null  int64  
 7   UpVotes       90584 non-null  int64  
 8   DownVotes     90584 non-null  int64  
dtypes: float64(2), int64(7)
memory usage: 6.9 MB


In [28]:
posts_users.head(15)

Unnamed: 0,postId,Score,userId,ViewCount,CommentCount,Reputation,Views,UpVotes,DownVotes
0,1,23,8.0,1278.0,1,6764,1089,604,25
1,16,16,8.0,,3,6764,1089,604,25
2,36,41,8.0,67396.0,7,6764,1089,604,25
3,65,14,8.0,,3,6764,1089,604,25
4,78,33,8.0,,4,6764,1089,604,25
5,111,8,8.0,,3,6764,1089,604,25
6,114,30,8.0,1220.0,1,6764,1089,604,25
7,129,4,8.0,,0,6764,1089,604,25
8,142,5,8.0,,0,6764,1089,604,25
9,168,17,8.0,1022.0,1,6764,1089,604,25


In [29]:
posts_users.tail(15)

Unnamed: 0,postId,Score,userId,ViewCount,CommentCount,Reputation,Views,UpVotes,DownVotes
90569,115338,0,55729.0,,0,16,1,0,0
90570,115340,0,55730.0,18.0,1,1,0,0,0
90571,115348,0,55564.0,18.0,3,101,0,1,0
90572,115350,0,55731.0,3.0,0,1,0,0,0
90573,115351,1,7331.0,9.0,0,6,2,0,0
90574,115352,0,55734.0,16.0,0,1,0,0,0
90575,115356,1,55733.0,15.0,0,6,1,0,0
90576,115360,2,55738.0,40.0,4,11,0,0,0
90577,115361,0,55383.0,12.0,1,1,0,0,0
90578,115364,0,52858.0,11.0,3,101,2,1,0


#### 8. How many missing values do you have in your merged dataframe? On which columns?

In [47]:
posts_users.info()
#Those are the non missing values per column, I can see already that
#column has less values- we could also use post_users.count()
#returning a Panda series with columns and counts

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90584 entries, 0 to 90583
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   postId        90584 non-null  int64  
 1   Score         90584 non-null  int64  
 2   userId        90584 non-null  float64
 3   ViewCount     42188 non-null  float64
 4   CommentCount  90584 non-null  int64  
 5   Reputation    90584 non-null  int64  
 6   Views         90584 non-null  int64  
 7   UpVotes       90584 non-null  int64  
 8   DownVotes     90584 non-null  int64  
dtypes: float64(2), int64(7)
memory usage: 6.9 MB


In [42]:
#We get the missing values per column
posts_users.isnull().sum()
#There are on ViewCount more missing values than values

postId              0
Score               0
userId              0
ViewCount       48396
CommentCount        0
Reputation          0
Views               0
UpVotes             0
DownVotes           0
dtype: int64

#### 9. You will need to make something with missing values.  Will you clean or filling them? Explain. 
**Remember** to check the results of your code before going to the next step.

Decide what to do, depending of the case and explain why we did that.

In [43]:
#With a bigger amount of bigger values than data perse I would 
#drop the colum.
posts_users = posts_users.drop(columns="ViewCount")

#### 10. Adjust the data types in order to avoid future issues. Which ones should be changed? 

In [54]:
posts_users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90584 entries, 0 to 90583
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   postId        90584 non-null  int64  
 1   Score         90584 non-null  int64  
 2   userId        90584 non-null  float64
 3   ViewCount     42188 non-null  float64
 4   CommentCount  90584 non-null  int64  
 5   Reputation    90584 non-null  int64  
 6   Views         90584 non-null  int64  
 7   UpVotes       90584 non-null  int64  
 8   DownVotes     90584 non-null  int64  
dtypes: float64(2), int64(7)
memory usage: 6.9 MB


In [49]:
users.info()
#user Id here a integer

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40325 entries, 0 to 40324
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Unnamed: 0  40325 non-null  int64
 1   userId      40325 non-null  int64
 2   Reputation  40325 non-null  int64
 3   Views       40325 non-null  int64
 4   UpVotes     40325 non-null  int64
 5   DownVotes   40325 non-null  int64
dtypes: int64(6)
memory usage: 1.8 MB


In [55]:
posts.info()
#We have lost some entries of Posts using this kind of merge...
#userId is here a float, we also see we have lost some

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91976 entries, 0 to 91975
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    91976 non-null  int64  
 1   postId        91976 non-null  int64  
 2   userId        90584 non-null  float64
 3   Score         91976 non-null  int64  
 4   ViewCount     42921 non-null  float64
 5   CommentCount  91976 non-null  int64  
dtypes: float64(2), int64(4)
memory usage: 4.2 MB


In [57]:
posts["ViewCount"].value_counts()

31.0      300
38.0      297
27.0      281
37.0      280
24.0      276
         ... 
4184.0      1
5446.0      1
8200.0      1
4322.0      1
1613.0      1
Name: ViewCount, Length: 3714, dtype: int64

In [50]:
posts=posts.astype(int)

ValueError: Cannot convert non-finite values (NA or inf) to integer