# Data Cleaning 

#### 1. Import pandas library.

In [34]:
import pandas as pd

#### 2. Import pymysql and sqlalchemy as you learned in the lesson on importing/exporting data.


In [35]:
import pymysql
from sqlalchemy import create_engine

#### 3. Create a mysql engine to set the connection to the server. Check the connection details in [this link](https://relational.fit.cvut.cz/dataset/Stats).

In [36]:
# Connection details for the "stats" database (see the dataset page linked above):
#   hostname: relational.fit.cvut.cz
#   port:     3306
#   username: guest
#   password: relational
connection_url = 'mysql+pymysql://guest:relational@relational.fit.cvut.cz:3306/stats'
engine = create_engine(connection_url)


#### 4. Import the users table.

In [37]:
# Load every column of the users table into a DataFrame.
users_query = 'SELECT * From users'
user = pd.read_sql_query(sql=users_query, con=engine)



#### 5. Rename Id column to userId.

In [38]:
# rename() returns a new frame; the original `user` frame is left untouched.
new_user = user.rename({'Id': 'userId'}, axis='columns')



#### 6. Import the posts table. 

In [39]:
# Load every column of the posts table into a DataFrame.
posts_query = 'SELECT * From posts'
posts = pd.read_sql_query(sql=posts_query, con=engine)



#### 7. Rename Id column to postId and OwnerUserId to userId.

In [44]:
# rename() returns a new frame; the original `posts` frame is left untouched.
post_column_names = {
    'Id': 'postId',
    'OwnerUserId': 'userId',
}
new_posts = posts.rename(post_column_names, axis='columns')



#### 8. Define new dataframes for users and posts with the following selected columns:
**users columns**: userId, Reputation, Views, UpVotes, DownVotes  
**posts columns**: postId, Score, userId, ViewCount, CommentCount

In [45]:
# Keep only the columns requested in the task, in the order they are listed.
user_columns = ['userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes']
post_columns = ['postId', 'Score', 'userId', 'ViewCount', 'CommentCount']

user2 = new_user[user_columns]
posts2 = new_posts[post_columns]

# Only the last expression of a cell is rendered automatically, so a bare
# `user2` in the middle of the cell shows nothing; display it explicitly.
display(user2)
posts2

Unnamed: 0,postId,Score,userId,ViewCount,CommentCount
0,1,23,8.0,1278.0,1
1,2,22,24.0,8198.0,1
2,3,54,18.0,3613.0,4
3,4,13,23.0,5224.0,2
4,5,81,23.0,,3
...,...,...,...,...,...
91971,115374,2,805.0,,2
91972,115375,0,49365.0,9.0,0
91973,115376,1,55746.0,5.0,2
91974,115377,0,805.0,,0


#### 9. Merge the new dataframes you have created, of users and posts. 
You will need to perform an inner [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of the posts and users dataframes.

In [97]:
# Inner join on the shared key: only userId values present in both
# user2 and posts2 are kept in the result.
new = pd.merge(user2, posts2, how='inner', on='userId')
new

Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes,postId,Score,ViewCount,CommentCount
0,-1,1,0,5007,1920,2175,0,,0
1,-1,1,0,5007,1920,8576,0,,0
2,-1,1,0,5007,1920,8578,0,,0
3,-1,1,0,5007,1920,8981,0,,0
4,-1,1,0,5007,1920,8982,0,,0
...,...,...,...,...,...,...,...,...,...
90579,55734,1,0,0,0,115352,0,16.0,0
90580,55738,11,0,0,0,115360,2,40.0,4
90581,55742,6,0,0,0,115366,1,17.0,0
90582,55744,6,1,0,0,115370,1,13.0,2


#### 10. How many missing values do you have in your merged dataframe? On which columns?

In [98]:

# Count the missing values in each column of the merged frame.
null = new.isna().sum()
print(null)

# Summary of the per-column counts printed above.
print('The value in the column ViewCount get missed ')


userId              0
Reputation          0
Views               0
UpVotes             0
DownVotes           0
postId              0
Score               0
ViewCount       48396
CommentCount        0
dtype: int64
The value in the column ViewCount get missed 


#### 11. You will need to do something about the missing values. Will you remove them or fill them? Explain.
**Remember** to check the results of your code before going to the next step.

In [100]:


# Inspect the rows that have no ViewCount before deciding how to handle them.
# isnull() already returns a boolean mask, so no `== True` comparison is needed.
null_ViewCount = new[new['ViewCount'].isnull()]

# A bare expression in the middle of a cell is not rendered; display it
# explicitly so the evidence for the decision below is actually visible.
display(null_ViewCount.head(50))

# Decision: fill the missing ViewCount values with 0.
# NOTE(review): the rationale given was that the rows with a missing ViewCount
# also show Views == 0 in the inspected sample — confirm this holds for all
# affected rows before treating the filled 0 as a real view count.
new['ViewCount'] = new['ViewCount'].fillna(0)

new

Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes,postId,Score,ViewCount,CommentCount
0,-1,1,0,5007,1920,2175,0,0.0,0
1,-1,1,0,5007,1920,8576,0,0.0,0
2,-1,1,0,5007,1920,8578,0,0.0,0
3,-1,1,0,5007,1920,8981,0,0.0,0
4,-1,1,0,5007,1920,8982,0,0.0,0
...,...,...,...,...,...,...,...,...,...
90579,55734,1,0,0,0,115352,0,16.0,0
90580,55738,11,0,0,0,115360,2,40.0,4
90581,55742,6,0,0,0,115366,1,17.0,0
90582,55744,6,1,0,0,115370,1,13.0,2


#### 12. Adjust the data types in order to avoid future issues. Which ones should be changed? 

In [103]:
# ViewCount is the only float column in the merged frame; with its missing
# values already filled, cast it to int64 to match the other columns.
filled_view_counts = new['ViewCount']
new['ViewCount'] = filled_view_counts.astype('int64')
new.dtypes

userId          int64
Reputation      int64
Views           int64
UpVotes         int64
DownVotes       int64
postId          int64
Score           int64
ViewCount       int64
CommentCount    int64
dtype: object