# Data Cleaning 

#### 1. Import pandas library.

In [15]:
import pandas as pd

#### 2. Import pymysql and sqlalchemy as you have learnt in the lesson of importing/exporting data. 


In [16]:
from sqlalchemy import create_engine

In [17]:
import pymysql.cursors

#### 3. Create a mysql engine to set the connection to the server. Check the connection details in [this link](https://relational.fit.cvut.cz/dataset/Stats).

In [18]:
con = pymysql.connect('relational.fit.cvut.cz', 'guest', 'relational', 'stats')

try:

    with con.cursor() as cur:

        cur.execute('SELECT VERSION()')

        version = cur.fetchone()

        print(f'Database version: {version[0]}')

finally:

    con.close()

Database version: 10.3.15-MariaDB-log


#### 4. Import the users table.

In [19]:
users_table=pd.read_csv("../data/users.csv" , index_col=0)

users_table


Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes
0,-1,1,0,5007,1920
1,2,101,25,3,0
2,3,101,22,19,0
3,4,101,11,0,0
4,5,6792,1145,662,5
...,...,...,...,...,...
40320,55743,1,0,0,0
40321,55744,6,1,0,0
40322,55745,101,0,0,0
40323,55746,106,1,0,0


#### 5. Rename Id column to userId.

In [20]:
users_table.rename(columns={"userId": "Id"},inplace=True)


users_table


Unnamed: 0,Id,Reputation,Views,UpVotes,DownVotes
0,-1,1,0,5007,1920
1,2,101,25,3,0
2,3,101,22,19,0
3,4,101,11,0,0
4,5,6792,1145,662,5
...,...,...,...,...,...
40320,55743,1,0,0,0
40321,55744,6,1,0,0
40322,55745,101,0,0,0
40323,55746,106,1,0,0


#### 6. Import the posts table. 

In [21]:
posts_table = pd.read_csv("../data/posts.csv",index_col=0)

posts_table

Unnamed: 0,PostId,userId,Score,ViewCount,CommentCount
0,1,8.0,23,1278.0,1
1,2,24.0,22,8198.0,1
2,3,18.0,54,3613.0,4
3,4,23.0,13,5224.0,2
4,5,23.0,81,,3
...,...,...,...,...,...
91971,115374,805.0,2,,2
91972,115375,49365.0,0,9.0,0
91973,115376,55746.0,1,5.0,2
91974,115377,805.0,0,,0


#### 7. Rename Id column to postId and OwnerUserId to userId.

In [22]:
posts_table.rename(columns={"PostId": "Id", "userId":"OwnerUserId"},inplace=True)

posts_table

Unnamed: 0,Id,OwnerUserId,Score,ViewCount,CommentCount
0,1,8.0,23,1278.0,1
1,2,24.0,22,8198.0,1
2,3,18.0,54,3613.0,4
3,4,23.0,13,5224.0,2
4,5,23.0,81,,3
...,...,...,...,...,...
91971,115374,805.0,2,,2
91972,115375,49365.0,0,9.0,0
91973,115376,55746.0,1,5.0,2
91974,115377,805.0,0,,0


#### 8. Define new dataframes for users and posts with the following selected columns:
**users_sliced columns**: userId, Reputation, Views, UpVotes, DownVotes  
**posts_sliced columns**: postId, Score, userId, ViewCount, CommentCount

In [23]:
users_sliced_columns = ["userld", "Reputation", "Views","UpVotes","DownVotes"]

users_table.columns = users_sliced_columns

users_table

Unnamed: 0,userld,Reputation,Views,UpVotes,DownVotes
0,-1,1,0,5007,1920
1,2,101,25,3,0
2,3,101,22,19,0
3,4,101,11,0,0
4,5,6792,1145,662,5
...,...,...,...,...,...
40320,55743,1,0,0,0
40321,55744,6,1,0,0
40322,55745,101,0,0,0
40323,55746,106,1,0,0


In [24]:
posts_sliced_columns=["postId", "Score", "userId", "ViewCount", "CommentCount"]

posts_table.columns=posts_sliced_columns

posts_table

Unnamed: 0,postId,Score,userId,ViewCount,CommentCount
0,1,8.0,23,1278.0,1
1,2,24.0,22,8198.0,1
2,3,18.0,54,3613.0,4
3,4,23.0,13,5224.0,2
4,5,23.0,81,,3
...,...,...,...,...,...
91971,115374,805.0,2,,2
91972,115375,49365.0,0,9.0,0
91973,115376,55746.0,1,5.0,2
91974,115377,805.0,0,,0


#### 9. Merge the two dataframes created in the step above (8), users_sliced and posts_sliced. 
You will need to make a [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes.

In [25]:
users_posts_sliced = users_table.merge(posts_table, left_on='userld', right_on='postId')

users_posts_sliced

Unnamed: 0,userld,Reputation,Views,UpVotes,DownVotes,postId,Score,userId,ViewCount,CommentCount
0,2,101,25,3,0,2,24.0,22,8198.0,1
1,3,101,22,19,0,3,18.0,54,3613.0,4
2,4,101,11,0,0,4,23.0,13,5224.0,2
3,5,6792,1145,662,5,5,23.0,81,,3
4,6,457,114,47,0,6,5.0,152,29229.0,5
...,...,...,...,...,...,...,...,...,...,...
32052,55743,1,0,0,0,55743,24164.0,0,,1
32053,55744,6,1,0,0,55744,10961.0,1,,7
32054,55745,101,0,0,0,55745,4598.0,3,,2
32055,55746,106,1,0,0,55746,20315.0,3,59.0,0


#### 10. How many missing values do you have in your merged dataframe? On which columns?

In [26]:
users_posts_sliced.info()
users_table.info()
posts_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32057 entries, 0 to 32056
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   userld        32057 non-null  int64  
 1   Reputation    32057 non-null  int64  
 2   Views         32057 non-null  int64  
 3   UpVotes       32057 non-null  int64  
 4   DownVotes     32057 non-null  int64  
 5   postId        32057 non-null  int64  
 6   Score         31285 non-null  float64
 7   userId        32057 non-null  int64  
 8   ViewCount     13046 non-null  float64
 9   CommentCount  32057 non-null  int64  
dtypes: float64(2), int64(8)
memory usage: 2.7 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 40325 entries, 0 to 40324
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   userld      40325 non-null  int64
 1   Reputation  40325 non-null  int64
 2   Views       40325 non-null  int64
 3   UpVotes     40325

In [28]:
users_posts_sliced.shape

(32057, 10)

In [29]:
users_table.shape

(40325, 5)

In [30]:
posts_table.shape

(91976, 5)

#### 11. You will need to make something with missing values.  Will you clean or filling them? Explain. 
**Remember** to check the results of your code before going to the next step.

#### 12. Adjust the data types in order to avoid future issues. Which ones should be changed? 