#### 1. Import pandas library

In [1]:
import pandas as pd

#### 2. Import pymysql and sqlalchemy as you learned in the lesson on importing/exporting data


In [5]:
from sqlalchemy import create_engine
import pymysql

#### 3. Create a MySQL engine to set up the connection to the server. Check the connection details in [this link](https://relational.fit.cvut.cz/search?tableCount%5B%5D=0-10&tableCount%5B%5D=10-30&dataType%5B%5D=Numeric&databaseSize%5B%5D=KB&databaseSize%5B%5D=MB)

In [7]:
# SQLAlchemy engine for the `stats` database, using the pymysql driver.
# The connection details come from the page linked in the exercise text.
engine = create_engine(
    'mysql+pymysql://guest:relational@relational.fit.cvut.cz/stats'
)

#### 4. Import the users table 

In [19]:
# Run the query against the server and load the result into a pandas DataFrame
data = pd.read_sql_query(sql='SELECT * FROM stats.users', con=engine)

#### 5. Rename Id column to userId

In [20]:
# Rename the Id column to userId (returns a new frame instead of mutating in place)
data = data.rename(columns={'Id': 'userId'})

#### 6. Import the posts table. 

In [21]:
# Load the full posts table into a pandas DataFrame
data2 = pd.read_sql_query(sql='SELECT * FROM stats.posts', con=engine)

#### 7. Rename Id column to postId and OwnerUserId to userId

In [24]:
# Rename Id -> postId and OwnerUserId -> userId
# (returns a new frame instead of mutating in place)
data2 = data2.rename(
    columns={
        'Id': 'postId',
        'OwnerUserId': 'userId',
    }
)

Index(['postId', 'PostTypeId', 'AcceptedAnswerId', 'CreaionDate', 'Score',
       'ViewCount', 'Body', 'userId', 'LasActivityDate', 'Title', 'Tags',
       'AnswerCount', 'CommentCount', 'FavoriteCount', 'LastEditorUserId',
       'LastEditDate', 'CommunityOwnedDate', 'ParentId', 'ClosedDate',
       'OwnerDisplayName', 'LastEditorDisplayName'],
      dtype='object')

#### 8. Define new dataframes for users and posts with the following selected columns:
    **users columns**: userId, Reputation,Views,UpVotes,DownVotes
    **posts columns**: postId, Score, userId, ViewCount, CommentCount

In [27]:
# Keep only the columns requested by the exercise for each table
users_df = data.loc[:, ['userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes']]
posts_df = data2.loc[:, ['postId', 'Score', 'userId', 'ViewCount', 'CommentCount']]

In [72]:
# (rows, columns) of the selected users data
users_df.shape

(40325, 5)

In [29]:
# (rows, columns) of the selected posts data
posts_df.shape

(91976, 5)

#### 8. Merge both dataframes, users and posts. 
You will need to make a [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes.

In [77]:
# Inner join on userId: only posts whose userId also appears in users_df are kept
master_df = pd.merge(posts_df, users_df, on='userId', how='inner')

In [78]:
# (rows, columns) of the merged frame; the inner join can leave fewer rows than posts_df
master_df.shape

(90584, 9)

In [79]:
# Preview the first rows of the merged frame
master_df.head()

Unnamed: 0,postId,Score,userId,ViewCount,CommentCount,Reputation,Views,UpVotes,DownVotes
0,1,23,8.0,1278.0,1,6764,1089,604,25
1,16,16,8.0,,3,6764,1089,604,25
2,36,41,8.0,67396.0,7,6764,1089,604,25
3,65,14,8.0,,3,6764,1089,604,25
4,78,33,8.0,,4,6764,1089,604,25


#### 9. How many missing values do you have in your merged dataframe? On which columns?

In [80]:
# Number of missing values in each column of the merged frame
master_df.isnull().sum()

postId              0
Score               0
userId              0
ViewCount       48396
CommentCount        0
Reputation          0
Views               0
UpVotes             0
DownVotes           0
dtype: int64

#### 10. You will need to do something about the missing values. Will you remove them or fill them? Explain.
**Remember** to check the results of your code before moving on to the next step

In [81]:
print("I'll remove the whole column to clean the missing values since there are more than half of th total registers")
# Drop the ViewCount column from the merged frame
master_df = master_df.drop(columns=['ViewCount'])

I'll remove the whole column to clean the missing values since there are more than half of th total registers


In [82]:
# Preview the frame to confirm ViewCount is no longer present
master_df.head()

Unnamed: 0,postId,Score,userId,CommentCount,Reputation,Views,UpVotes,DownVotes
0,1,23,8.0,1,6764,1089,604,25
1,16,16,8.0,3,6764,1089,604,25
2,36,41,8.0,7,6764,1089,604,25
3,65,14,8.0,3,6764,1089,604,25
4,78,33,8.0,4,6764,1089,604,25


In [68]:
# Re-count missing values per column after the clean-up
master_df.isnull().sum()

postId          0
Score           0
userId          0
CommentCount    0
Reputation      0
Views           0
UpVotes         0
DownVotes       0
dtype: int64

#### 11. Adjust the data types in order to avoid future issues. Which ones should be changed? 

In [51]:
# Inspect the dtype of every column
master_df.dtypes

postId            int64
Score             int64
userId          float64
CommentCount      int64
Reputation        int64
Views             int64
UpVotes           int64
DownVotes         int64
dtype: object

In [83]:
# Convert userId to int64 so it is stored as an integer rather than a float
master_df = master_df.astype({'userId': 'int64'})

In [84]:
# Preview the frame after the dtype conversion
master_df.head()

Unnamed: 0,postId,Score,userId,CommentCount,Reputation,Views,UpVotes,DownVotes
0,1,23,8,1,6764,1089,604,25
1,16,16,8,3,6764,1089,604,25
2,36,41,8,7,6764,1089,604,25
3,65,14,8,3,6764,1089,604,25
4,78,33,8,4,6764,1089,604,25


#### Bonus: Identify extreme values in your merged dataframe as you learned in class. Create a dataframe called outliers with the same columns as the data set and calculate the bounds. The outliers dataframe will contain the rows of the merged dataframe whose values fall outside those bounds. You will need to save your outliers dataframe to a csv file in the your-code folder.

In [86]:
# Descriptive statistics, transposed so each described column becomes a row,
# plus the interquartile range (75th percentile - 25th percentile) as an extra column.
stats = (
    master_df
    .describe()
    .T
    .assign(IQR=lambda summary: summary['75%'] - summary['25%'])
)

In [90]:
# Build `outliers`: every row of master_df whose value in some column lies
# outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] for that column. A row that is
# extreme in several columns appears once per column, and the extra 'Outlier'
# column records which column flagged it.
outlier_frames = []
# stats is indexed by the described column names, so iterating its index
# walks the columns of master_df that have quartiles available.
for column in stats.index:
    # Rows further than 1.5 times the interquartile range from the quartiles
    # are treated as outliers.
    cutoff = stats.at[column, 'IQR'] * 1.5
    lower = stats.at[column, '25%'] - cutoff  # lower bound
    upper = stats.at[column, '75%'] + cutoff  # upper bound
    # Both bounds are tested against the SAME column. The previous version
    # compared the upper bound against an undefined name (`col`).
    is_outlier = (master_df[column] < lower) | (master_df[column] > upper)
    flagged = master_df[is_outlier].copy()
    flagged['Outlier'] = column
    outlier_frames.append(flagged)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# growing a frame inside a loop copies all accumulated rows on every iteration.
if outlier_frames:
    outliers = pd.concat(outlier_frames)
else:
    # Nothing to inspect: keep an empty frame with the same columns as master_df.
    outliers = pd.DataFrame(columns=master_df.columns)

In [92]:
# Preview the first rows of the collected outliers
outliers.head()

Unnamed: 0,postId,Score,userId,CommentCount,Reputation,Views,UpVotes,DownVotes,Outlier
0,1,23,8,1,6764,1089,604,25,Score
1,16,16,8,3,6764,1089,604,25,Score
2,36,41,8,7,6764,1089,604,25,Score
3,65,14,8,3,6764,1089,604,25,Score
4,78,33,8,4,6764,1089,604,25,Score
