#### 1. Import pandas library

In [1]:
import pandas as pd

#### 2. Import pymysql and sqlalchemy as you have learnt in the lesson of importing/exporting data 


In [2]:
import pymysql
from sqlalchemy import create_engine

#### 3. Create a mysql engine to set the connection to the server. Check the connection details in [this link](https://relational.fit.cvut.cz/dataset/Stats)

In [3]:
engine = create_engine('mysql+pymysql://guest:relational@relational.fit.cvut.cz:3306')

#### 4. Import the users table 

In [None]:
data = pd.read_sql_query(
"""
SELECT * 
FROM stats.users"""
, engine)

data.head()

#### 5. Rename Id column to userId

In [None]:
cols = ", ".join([col for col in data.columns[1:]])

query = f""" 
SELECT Id AS userId, {cols}
FROM stats.users"""
users = pd.read_sql_query(query, engine)
users.head()

#### 6. Import the posts table. 

In [None]:
posts = pd.read_sql_query(
"""
SELECT * 
FROM stats.posts"""
, engine)

posts.head()

#### 7. Rename Id column to postId and OwnerUserId to userId

In [None]:
posts = posts.rename(columns={'Id':'postId', 'OwnerUserId':'userId'})
posts

#### 8. Define new dataframes for users and posts with the following selected columns:
    **users columns**: userId, Reputation,Views,UpVotes,DownVotes
    **posts columns**: postId, Score,userID,ViewCount,CommentCount

In [26]:
new_users = users[['userId', 'Reputation','Views', 'UpVotes','DownVotes']].copy()
new_posts = posts[['postId', 'Score', 'userId', 'ViewCount', 'CommentCount']].copy()

#### 8. Merge both dataframes, users and posts. 
You will need to make a [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes.

In [None]:
data = pd.merge(new_users, new_posts, on='userId')
data

#### 9. How many missing values do you have in your merged dataframe? On which columns?

In [46]:
null_cols = data.isnull().sum()
list(null_cols[null_cols > 0].index)

['ViewCount']

#### 10. You will need to make something with missing values.  Will you clean or filling them? Explain. 
**Remember** to check the results of your code before passing to the next step

In [None]:
# The NaN values were filled with 0's because those empty values on the 'ViewCount' column mean that there are 
# no views (0) for that user

data['ViewCount'] = data['ViewCount'].fillna(0)
data.drop_duplicates()

#### 11. Adjust the data types in order to avoid future issues. Which ones should be changed? 

In [52]:
data.dtypes
data = data.astype({"ViewCount":'int64'})

userId          int64
Reputation      int64
Views           int64
UpVotes         int64
DownVotes       int64
postId          int64
Score           int64
ViewCount       int64
CommentCount    int64
dtype: object

#### Bonus: Identify extreme values in your merged dataframe as you have learned in class, create a dataframe called outliers with the same columns as our data set and calculate the bounds. The values of the outliers dataframe will be the values of the merged_df that fall outside that bounds. You will need to save your outliers dataframe to a csv file on your-code folder.

In [None]:
stats = data.describe().transpose()
stats['IQR'] = stats['75%'] - stats['25%']

outliers = pd.DataFrame(columns=data.columns)

for col in stats.index:
    iqr = stats.at[col,'IQR']
    cutoff = iqr * 1.5
    lower = stats.at[col,'25%'] - cutoff
    upper = stats.at[col,'75%'] + cutoff
    #print(col, iqr, cutoff, lower, upper)
    results = data[(data[col] < lower) |
                   (data[col] > upper)].copy()
    results['Outlier'] = col
    outliers = outliers.append(results)
    
outliers