# Data Cleaning 

#### 1. Import pandas library.

In [2]:
import pandas as pd

#### 2. Import pymysql and sqlalchemy as you have learnt in the lesson of importing/exporting data. 


#### 3. Create a mysql engine to set the connection to the server. Check the connection details in [this link](https://relational.fit.cvut.cz/dataset/Stats).

#### 4. Import the users table.

In [3]:
# Load the users table from a local CSV export (path is relative to the notebook's working directory).
# NOTE(review): the preview below shows an extra "Unnamed: 0" column, presumably a saved index — confirm.
users=pd.read_csv("../datasets/users.csv")

In [4]:
# Peek at the first five rows to check which columns were loaded.
users.head(n=5)

Unnamed: 0.1,Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes
0,0,-1,1,0,5007,1920
1,1,2,101,25,3,0
2,2,3,101,22,19,0
3,3,4,101,11,0,0
4,4,5,6792,1145,662,5


#### 5. Rename Id column to userId.

In [5]:
# rename() applies a positional mapping to the index (row) labels, so the mapping
# must be passed as columns=... to rename a column.
# Reassigning the result (instead of inplace=True) keeps the cell safe to re-run.
# Labels that are not present are ignored by default, so this is a no-op when the
# column is already called userId.
users = users.rename(columns={"Id": "userId"})

#### 6. Import the posts table. 

In [6]:
# Load the posts table from a local CSV export (path is relative to the notebook's working directory).
# NOTE(review): the file appears to carry a saved index as an "Unnamed: 0" column, which is dropped further down — confirm.
posts=pd.read_csv("../datasets/posts.csv")

In [7]:
# Peek at the first five rows to check which columns were loaded.
posts.head(n=5)

Unnamed: 0.1,Unnamed: 0,PostId,userId,Score,ViewCount,CommentCount
0,0,1,8.0,23,1278.0,1
1,1,2,24.0,22,8198.0,1
2,2,3,18.0,54,3613.0,4
3,3,4,23.0,13,5224.0,2
4,4,5,23.0,81,,3


In [8]:
# Remove the leftover "Unnamed: 0" column; it is not part of the posts data.
posts = posts.drop(columns=["Unnamed: 0"])

In [28]:
# Check the first five rows again now that the "Unnamed: 0" column has been dropped.
posts.head(n=5)

Unnamed: 0,PostId,Score,userId,ViewCount,CommentCount
0,1,23,8.0,1278.0,1
1,2,22,24.0,8198.0,1
2,3,54,18.0,3613.0,4
3,4,13,23.0,5224.0,2
4,5,81,23.0,,3


#### 7. Rename Id column to postId and OwnerUserId to userId.

In [66]:
# Old column label -> new column label. Labels are case-sensitive.
post_column_renames = {"PostId": "postId"}
posts = posts.rename(columns=post_column_renames)

#### 8. Define new dataframes for users and posts with the following selected columns:
**users_sliced columns**: userId, Reputation, Views, UpVotes, DownVotes  
**posts_sliced columns**: postId, Score, userId, ViewCount, CommentCount

In [73]:
# Column labels to keep in each sliced dataframe.
# These must match the existing column labels exactly (labels are case-sensitive).
# The posts column was renamed from "PostId" to "postId" in step 7, so only
# "postId" matches from that point on; "PostId" no longer exists.
# Selecting columns by label never renames them: these lists pick columns, they
# do not overwrite column names.
usersc = [
    "userId",
    "Reputation",
    "Views",
    "UpVotes",
    "DownVotes",
]
postsc = [
    "postId",
    "Score",
    "userId",
    "ViewCount",
    "CommentCount",
]

In [74]:
# Keep only the columns listed in usersc / postsc, in that order.
# This selects columns by label; it does not rename anything.
# Indexing with a list of labels raises KeyError for a label that does not exist,
# whereas pd.DataFrame(df, columns=...) would silently add an all-NaN column for it.
# .copy() makes the slices independent of the source frames.

users_sliced = users[usersc].copy()
posts_sliced = posts[postsc].copy()

In [75]:
# First five rows of the sliced posts frame.
posts_sliced.head(n=5)

Unnamed: 0,postId,Score,userId,ViewCount,CommentCount
0,1,23,8.0,1278.0,1
1,2,22,24.0,8198.0,1
2,3,54,18.0,3613.0,4
3,4,13,23.0,5224.0,2
4,5,81,23.0,,3


In [32]:
# List the column labels of both sliced frames to confirm the selection.
for label, frame in (("users:", users_sliced), ("posts:", posts_sliced)):
    print(label, frame.columns.tolist())

users: ['userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes']
posts: ['postId', 'Score', 'userId', 'ViewCount', 'CommentCount']


In [33]:
# First five rows of the sliced users frame.
users_sliced.head(n=5)

Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes
0,-1,1,0,5007,1920
1,2,101,25,3,0
2,3,101,22,19,0
3,4,101,11,0,0
4,5,6792,1145,662,5


In [81]:
# count() ignores missing values, so this is the number of posts that have a ViewCount.
posts_sliced.loc[:, "ViewCount"].count()

42921

In [77]:
# How often each postId occurs; results are sorted by frequency, highest first.
post_ids = posts_sliced["postId"]
post_ids.value_counts()

4094     1
68276    1
4759     1
27288    1
25241    1
        ..
89409    1
83266    1
95556    1
97605    1
2049     1
Name: postId, Length: 91976, dtype: int64

In [83]:
# Number of non-missing postId values in the sliced posts frame.
posts_sliced.loc[:, "postId"].count()

91976

In [82]:
# Number of non-missing userId values in the sliced users frame.
users_sliced.loc[:, "userId"].count()

40325

#### 9. Merge the two dataframes created in the step above (8), users_sliced and posts_sliced. 
You will need to make a [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes.

In [78]:
# Join every user to their posts on the shared userId key.
# how="inner" is the default: only userId values present in both frames are kept.
Merge = pd.merge(users_sliced, posts_sliced, on="userId", how="inner")
Merge.head()

Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes,postId,Score,ViewCount,CommentCount
0,-1,1,0,5007,1920,2175,0,,0
1,-1,1,0,5007,1920,8576,0,,0
2,-1,1,0,5007,1920,8578,0,,0
3,-1,1,0,5007,1920,8981,0,,0
4,-1,1,0,5007,1920,8982,0,,0


In [79]:
# Sanity check that the merge carried ViewCount values across:
# frequency of each distinct ViewCount (missing values are not counted).
merged_view_counts = Merge["ViewCount"]
merged_view_counts.value_counts()

38.0       295
31.0       293
37.0       277
27.0       277
24.0       274
          ... 
25962.0      1
2586.0       1
2067.0       1
4443.0       1
3940.0       1
Name: ViewCount, Length: 3654, dtype: int64

In [84]:
# Boolean mask: True where ViewCount is missing.
Merge["ViewCount"].isna()

0         True
1         True
2         True
3         True
4         True
         ...  
90579    False
90580    False
90581    False
90582    False
90583    False
Name: ViewCount, Length: 90584, dtype: bool

#### 10. How many missing values do you have in your merged dataframe? On which columns?

In [96]:
# Missing values per column.
# Only ViewCount has any (48396): posts had just 42921 non-missing ViewCount
# values for 91976 postIds, and those gaps carry over into the merge.
Merge.isna().sum()

userId              0
Reputation          0
Views               0
UpVotes             0
DownVotes           0
PostId              0
Score               0
ViewCount       48396
CommentCount        0
dtype: int64

#### 11. You will need to do something about the missing values. Will you drop them or fill them? Explain. 
**Remember** to check the results of your code before going to the next step.

In [85]:
# Rows here are posts; a post without a view count is not useful for this
# analysis, so those rows are removed rather than filled.
# subset= limits the check to ViewCount (the only column with missing values),
# so a NaN in any other column would not silently remove rows.
Merge = Merge.dropna(subset=["ViewCount"])


In [90]:
# Renumber the rows 0..n-1, since dropping rows leaves gaps in the index.
# drop=True discards the old index instead of keeping it as a new column.

Merge=Merge.reset_index(drop=True)

Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes,postId,Score,ViewCount,CommentCount
0,5,6792,1145,662,5,6,152,29229.0,5
1,5,6792,1145,662,5,103,28,1990.0,6
2,5,6792,1145,662,5,125,75,29261.0,2
3,5,6792,1145,662,5,423,156,64481.0,7
4,5,6792,1145,662,5,562,10,1005.0,1
...,...,...,...,...,...,...,...,...,...
42183,55734,1,0,0,0,115352,0,16.0,0
42184,55738,11,0,0,0,115360,2,40.0,4
42185,55742,6,0,0,0,115366,1,17.0,0
42186,55744,6,1,0,0,115370,1,13.0,2


#### 12. Adjust the data types in order to avoid future issues. Which ones should be changed? 

In [91]:
# ViewCount should be changed: it is float64 while every other column is int64.
Merge.dtypes

userId            int64
Reputation        int64
Views             int64
UpVotes           int64
DownVotes         int64
postId            int64
Score             int64
ViewCount       float64
CommentCount      int64
dtype: object

In [94]:
# ViewCount holds whole numbers and its missing values were dropped above,
# so it can be stored as an integer column.
# "int64" is spelled out because the width of plain "int" is platform-dependent
# under older NumPy versions; this keeps it consistent with the other columns.
Merge = Merge.astype({"ViewCount": "int64"})

In [95]:
# Re-check the dtypes to confirm that ViewCount is now an integer column.
Merge.dtypes

userId          int64
Reputation      int64
Views           int64
UpVotes         int64
DownVotes       int64
postId          int64
Score           int64
ViewCount       int64
CommentCount    int64
dtype: object