# Data Cleaning 

#### 1. Import pandas library.

In [3]:
import pandas as pd

#### 2. Import pymysql and sqlalchemy as you learned in the lesson on importing/exporting data.


In [4]:
import pymysql
import sqlalchemy 
from sqlalchemy import create_engine

#### 3. Create a mysql engine to set the connection to the server. Check the connection details in [this link](https://relational.fit.cvut.cz/dataset/Stats).

In [5]:
# Build the engine for the public "stats" sample database, using the guest
# account given in the connection string.
engine = sqlalchemy.create_engine(
    "mysql+pymysql://guest:relational@relational.fit.cvut.cz:3306/stats"
)

#### 4. Import the users table.

In [6]:
# Load the complete `users` table from the database into a DataFrame.
users = pd.read_sql_table(table_name="users", con=engine)

#### 5. Rename Id column to userId.

In [7]:
# Give the generic `Id` column an explicit name; `users` is modified in place.
users.rename({"Id": "userId"}, axis="columns", inplace=True)

#### 6. Import the posts table. 

In [8]:
# Load the complete `posts` table from the database into a DataFrame.
posts = pd.read_sql_table(table_name="posts", con=engine)

#### 7. Rename Id column to postId and OwnerUserId to userId.

In [9]:
# rename(..., inplace=True) modifies the existing object and returns None, which
# is why the table has to be bound to a name (`posts`) before it is renamed.
posts.rename(
    columns={
        "Id": "postId",
        "OwnerUserId": "userId",
    },
    inplace=True,
)

#### 8. Define new dataframes for users and posts with the following selected columns:
**users columns**: userId, Reputation, Views, UpVotes, DownVotes  
**posts columns**: postId, Score, userId, ViewCount, CommentCount

In [10]:
# Keep only the user attributes needed for the analysis, as a separate frame.
user_columns = users[
    ["userId", "Reputation", "Views", "UpVotes", "DownVotes"]
].copy()
user_columns

Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes
0,-1,1,0,5007,1920
1,2,101,25,3,0
2,3,101,22,19,0
3,4,101,11,0,0
4,5,6792,1145,662,5
...,...,...,...,...,...
40320,55743,1,0,0,0
40321,55744,6,1,0,0
40322,55745,101,0,0,0
40323,55746,106,1,0,0


In [11]:
# Keep only the post attributes needed for the analysis, as a separate frame.
posts_columns = posts[
    ["postId", "Score", "userId", "ViewCount", "CommentCount"]
].copy()
posts_columns

Unnamed: 0,postId,Score,userId,ViewCount,CommentCount
0,1,23,8.0,1278.0,1
1,2,22,24.0,8198.0,1
2,3,54,18.0,3613.0,4
3,4,13,23.0,5224.0,2
4,5,81,23.0,,3
...,...,...,...,...,...
91971,115374,2,805.0,,2
91972,115375,0,49365.0,9.0,0
91973,115376,1,55746.0,5.0,2
91974,115377,0,805.0,,0


#### 9. Merge the new dataframes you have created, of users and posts. 
You will need to make an inner [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes.

In [12]:
# Inner join on userId: a row is kept only when the userId is present in both
# the users frame and the posts frame.
merged_df = user_columns.merge(posts_columns, how="inner", on="userId")
merged_df

Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes,postId,Score,ViewCount,CommentCount
0,-1,1,0,5007,1920,2175,0,,0
1,-1,1,0,5007,1920,8576,0,,0
2,-1,1,0,5007,1920,8578,0,,0
3,-1,1,0,5007,1920,8981,0,,0
4,-1,1,0,5007,1920,8982,0,,0
...,...,...,...,...,...,...,...,...,...
90579,55734,1,0,0,0,115352,0,16.0,0
90580,55738,11,0,0,0,115360,2,40.0,4
90581,55742,6,0,0,0,115366,1,17.0,0
90582,55744,6,1,0,0,115370,1,13.0,2


#### 10. How many missing values do you have in your merged dataframe? On which columns?

In [13]:
# Dimensions (rows, columns) of the selected users frame.
user_columns.shape

(40325, 5)

In [14]:
# Dimensions (rows, columns) of the selected posts frame.
posts_columns.shape

(91976, 5)

In [None]:
# Count the missing values in each column of the merged frame.
# The merge is an inner join on userId, so rows whose userId has no match in the
# other frame are dropped instead of being filled with NaN; userId itself
# therefore has no missing values. The nulls that remain are in ViewCount,
# carried over from posts that have no view count recorded.
merged_df.isnull().sum()

#### 11. You will need to do something about the missing values. Will you remove them or fill them? Explain.
**Remember** to check the results of your code before going to the next step.

In [None]:
# I'll fill them, because removing them could distort the statistics of the column.

In [16]:
# List the names of the columns that contain at least one null value.
na_values = merged_df.columns[merged_df.isna().any(axis=0)]
na_values

Index(['ViewCount'], dtype='object')

In [21]:
# Replace missing view counts with 0. The frame itself is updated (not a copy),
# because otherwise the column's dtype cannot be changed later (exercise 12).
merged_df["ViewCount"] = merged_df["ViewCount"].fillna(0)

#### 12. Adjust the data types in order to avoid future issues. Which ones should be changed? 

In [18]:
# Inspect the dtype of every column, in column order.
list(merged_df.dtypes)

[dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('float64'),
 dtype('int64')]

In [22]:
# View counts are whole numbers (there is no such thing as 0.2 views), so the
# column is stored as int64 like the other numeric columns.
# astype returns a new Series and leaves the frame untouched, so the result is
# assigned back to the column; otherwise the conversion would be discarded.
# NOTE: this requires the column to contain no NaN (they are filled beforehand).
merged_df['ViewCount'] = merged_df['ViewCount'].astype("int64")
merged_df['ViewCount']

0         0
1         0
2         0
3         0
4         0
         ..
90579    16
90580    40
90581    17
90582    13
90583     5
Name: ViewCount, Length: 90584, dtype: int64