# Data Cleaning 

#### 1. Import pandas library.

In [11]:
import pandas as pd

#### 2. Import pymysql and sqlalchemy, as you learned in the lesson on importing/exporting data.


In [1]:
import pymysql
import sqlalchemy as db

#### 3. Create a mysql engine to set the connection to the server. Check the connection details in [this link](https://relational.fit.cvut.cz/dataset/Stats).

In [3]:
from sqlalchemy import create_engine

# Credentials are the guest account from the dataset page linked in step 3.
# Use the PyMySQL driver: it is the DBAPI this notebook imports in step 2, whereas
# 'mysql+mysqlconnector' requires the separate mysql-connector-python package.
engine = create_engine('mysql+pymysql://guest:relational@relational.fit.cvut.cz:3306/stats')
# Open connection that the later cells hand to pandas when reading tables.
connection = engine.connect()
# Registry that collects the table definitions reflected in the following cells.
metadata = db.MetaData()

#### 4. Import the users table.

In [33]:
# Reflect the `users` table definition into `metadata`. Passing `autoload_with`
# is enough to trigger reflection; the separate `autoload=True` flag is deprecated
# since SQLAlchemy 1.4 and rejected by SQLAlchemy 2.0.
db.Table('users', metadata, autoload_with=engine)
# Read the whole table into a DataFrame over the already-open connection.
users = pd.read_sql_table('users', connection)

#### 5. Rename Id column to userId.

In [41]:
# Rename by column name instead of by position, so a different column order can
# never cause the wrong column to be relabelled. `rename` ignores keys that are
# absent, which also makes this cell safe to re-run.
users = users.rename(columns={'Id': 'userId'})
# Show the resulting column names to confirm the rename.
headers = list(users.columns)
headers

['userId',
 'Reputation',
 'CreationDate',
 'DisplayName',
 'LastAccessDate',
 'WebsiteUrl',
 'Location',
 'AboutMe',
 'Views',
 'UpVotes',
 'DownVotes',
 'AccountId',
 'Age',
 'ProfileImageUrl']

#### 6. Import the posts table. 

In [42]:
# Reflect the `posts` table definition into `metadata`. Passing `autoload_with`
# is enough to trigger reflection; the separate `autoload=True` flag is deprecated
# since SQLAlchemy 1.4 and rejected by SQLAlchemy 2.0.
db.Table('posts', metadata, autoload_with=engine)
# Read the whole table into a DataFrame over the already-open connection.
posts = pd.read_sql_table('posts', connection)

#### 7. Rename Id column to postId and OwnerUserId to userId.

In [43]:
# Rename the owner column by name rather than by the hard-coded position 7, so a
# change in column order cannot relabel the wrong column. `rename` ignores keys
# that are absent, which also makes this cell safe to re-run.
# NOTE(review): the heading also asks for Id -> postId. The post key is left as
# 'Id' on this frame because the later cells still select it under that name.
posts = posts.rename(columns={'OwnerUserId': 'userId'})
# Show the resulting column names to confirm the rename.
headers2 = list(posts.columns)
headers2

['Id',
 'PostTypeId',
 'AcceptedAnswerId',
 'CreaionDate',
 'Score',
 'ViewCount',
 'Body',
 'userId',
 'LasActivityDate',
 'Title',
 'Tags',
 'AnswerCount',
 'CommentCount',
 'FavoriteCount',
 'LastEditorUserId',
 'LastEditDate',
 'CommunityOwnedDate',
 'ParentId',
 'ClosedDate',
 'OwnerDisplayName',
 'LastEditorDisplayName']

#### 8. Define new dataframes for users and posts with the following selected columns:
**users columns**: userId, Reputation, Views, UpVotes, DownVotes  
**posts columns**: postId, Score, userID, ViewCount, CommentCount

In [46]:
df_users = pd.DataFrame(users, columns=['userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes'])
df_posts = pd.DataFrame(posts, columns=['Id', 'postId', 'Score', 'userID', 'ViewCount', 'CommentCount'])


#### 9. Merge the new dataframes you have created, of users and posts. 
You will need to make an inner [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes.

In [47]:
merged_df = df_users.merge(df_posts, how='inner', left_on='userId', right_on='Id')
merged_df

Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes,Id,postId,Score,userID,ViewCount,CommentCount
0,2,101,25,3,0,2,,22,,8198.0,1
1,3,101,22,19,0,3,,54,,3613.0,4
2,4,101,11,0,0,4,,13,,5224.0,2
3,5,6792,1145,662,5,5,,81,,,3
4,6,457,114,47,0,6,,152,,29229.0,5
...,...,...,...,...,...,...,...,...,...,...,...
32052,55743,1,0,0,0,55743,,0,,,1
32053,55744,6,1,0,0,55744,,1,,,7
32054,55745,101,0,0,0,55745,,3,,,2
32055,55746,106,1,0,0,55746,,3,,59.0,0


#### 10. How many missing values are there in your merged dataframe? In which columns?

In [48]:
# Count the missing entries in each column of the merged frame.
merged_df.isna().sum()

userId              0
Reputation          0
Views               0
UpVotes             0
DownVotes           0
Id                  0
postId          32057
Score               0
userID          32057
ViewCount       19011
CommentCount        0
dtype: int64

#### 11. You will need to do something about the missing values. Will you drop them or fill them? Explain.
**Remember** to check the results of your code before going to the next step.

In [51]:
# Drop rather than fill: a column with no values at all has nothing to impute from.
# Such columns are detected by content instead of by hard-coded name, so a
# populated column is never discarded and an absent name cannot raise KeyError.
# NOTE(review): ViewCount is only partially missing and is left untouched here;
# decide separately whether to fill it or keep the NaN values.
clean_merged = merged_df.dropna(axis='columns', how='all')
clean_merged

Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes,Id,Score,ViewCount,CommentCount
0,2,101,25,3,0,2,22,8198.0,1
1,3,101,22,19,0,3,54,3613.0,4
2,4,101,11,0,0,4,13,5224.0,2
3,5,6792,1145,662,5,5,81,,3
4,6,457,114,47,0,6,152,29229.0,5
...,...,...,...,...,...,...,...,...,...
32052,55743,1,0,0,0,55743,0,,1
32053,55744,6,1,0,0,55744,1,,7
32054,55745,101,0,0,0,55745,3,,2
32055,55746,106,1,0,0,55746,3,59.0,0


#### 12. Adjust the data types in order to avoid future issues. Which ones should be changed? 