#### 1. Import pandas library

In [1]:
import pandas as pd

#### 2. Import pymysql and sqlalchemy as you have learnt in the lesson of importing/exporting data 


In [2]:
import pymysql
from sqlalchemy import create_engine

#### 3. Create a mysql engine to set the connection to the server. Check the connection details in [this link](https://relational.fit.cvut.cz/search?tableCount%5B%5D=0-10&tableCount%5B%5D=10-30&dataType%5B%5D=Numeric&databaseSize%5B%5D=KB&databaseSize%5B%5D=MB)

In [3]:
# Connection URL for the `stats` database, using the guest account over the pymysql driver.
connection_url = 'mysql+pymysql://guest:relational@relational.fit.cvut.cz/stats'
engine = create_engine(connection_url)

#### 4. Import the users table 

In [4]:
# Load every column of the users table into a DataFrame.
users_query = 'SELECT * FROM stats.users'
data_users = pd.read_sql_query(sql=users_query, con=engine)

#### 5. Rename Id column to userId

In [5]:
# Give the users' key column an explicit name; it is the merge key used further down.
users_column_names = {'Id': 'userId'}
data_users = data_users.rename(columns=users_column_names)

#### 6. Import the posts table. 

In [6]:
# Load every column of the posts table into a DataFrame.
posts_query = 'SELECT * FROM stats.posts'
data_posts = pd.read_sql_query(sql=posts_query, con=engine)

#### 7. Rename Id column to postId and OwnerUserId to userId

In [7]:
# Rename the post key to postId and the owner reference to userId,
# so the owner column carries the same name as the users key.
posts_column_names = {
    'Id': 'postId',
    'OwnerUserId': 'userId',
}
data_posts = data_posts.rename(columns=posts_column_names)

#### 8. Define new dataframes for users and posts with the following selected columns:
    **users columns**: userId, Reputation,Views,UpVotes,DownVotes
    **posts columns**: postId, Score,userId,ViewCount,CommentCount

In [8]:
# Keep only the columns requested by the exercise, in the requested order.
user_columns = ['userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes']
post_columns = ['postId', 'Score', 'userId', 'ViewCount', 'CommentCount']

data_users = data_users[user_columns]
data_posts = data_posts[post_columns]

#### 8. Merge both dataframes, users and posts. 
You will need to make a [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes.

In [9]:
# Join users to posts on the shared userId key (pandas' default inner join).
data = data_users.merge(data_posts, on='userId')
data.head()

Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes,postId,Score,ViewCount,CommentCount
0,-1,1,0,5007,1920,2175,0,,0
1,-1,1,0,5007,1920,8576,0,,0
2,-1,1,0,5007,1920,8578,0,,0
3,-1,1,0,5007,1920,8981,0,,0
4,-1,1,0,5007,1920,8982,0,,0


#### 9. How many missing values do you have in your merged dataframe? On which columns?

In [10]:
# Count the nulls in each column; 'ViewCount' is the only column with missing values.
missing_per_column = data.isna().sum()
missing_per_column

userId              0
Reputation          0
Views               0
UpVotes             0
DownVotes           0
postId              0
Score               0
ViewCount       48396
CommentCount        0
dtype: int64

#### 10. You will need to do something about the missing values. Will you drop them or fill them? Explain.
**Remember** to check the results of your code before moving on to the next step.

In [11]:
# Fill the missing values with 0: posts without a recorded ViewCount are treated as
# having 0 views, which keeps every row of the merged frame.
data['ViewCount'] = data['ViewCount'].fillna(0)

# Check the result before moving on: no nulls may remain in the column.
assert data['ViewCount'].notna().all(), "ViewCount still contains missing values"

# Show a sample instead of dumping the whole frame into the notebook output.
data.head(10)

Unnamed: 0,userId,Reputation,Views,UpVotes,DownVotes,postId,Score,ViewCount,CommentCount
0,-1,1,0,5007,1920,2175,0,0.0,0
1,-1,1,0,5007,1920,8576,0,0.0,0
2,-1,1,0,5007,1920,8578,0,0.0,0
3,-1,1,0,5007,1920,8981,0,0.0,0
4,-1,1,0,5007,1920,8982,0,0.0,0
5,-1,1,0,5007,1920,9857,0,0.0,0
6,-1,1,0,5007,1920,9858,0,0.0,0
7,-1,1,0,5007,1920,9860,0,0.0,0
8,-1,1,0,5007,1920,10130,0,0.0,0
9,-1,1,0,5007,1920,10131,0,0.0,0


#### 11. Adjust the data types in order to avoid future issues. Which ones should be changed? 

In [12]:
# View counts are whole numbers, so the float64 column is cast to int64
# (there is no such thing as a partial view).
view_count_as_int = data['ViewCount'].astype('int64')
data['ViewCount'] = view_count_as_int
data.dtypes

userId          int64
Reputation      int64
Views           int64
UpVotes         int64
DownVotes       int64
postId          int64
Score           int64
ViewCount       int64
CommentCount    int64
dtype: object

#### Bonus: Identify extreme values in your merged dataframe as you have learned in class. Create a dataframe called outliers with the same columns as our data set and calculate the bounds. The values of the outliers dataframe will be the values of the merged_df that fall outside those bounds. You will need to save your outliers dataframe to a csv file in the your-code folder.

In [13]:
# Summary statistics with one row per numeric column, plus the
# interquartile range (75th percentile minus 25th percentile).
summary = data.describe().T
stats = summary.assign(IQR=summary['75%'] - summary['25%'])
stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,IQR
userId,90584.0,16546.764727,15273.367108,-1.0,3437.0,11032.0,27700.0,55746.0,24263.0
Reputation,90584.0,6282.395412,15102.26867,1.0,60.0,396.0,4460.0,87393.0,4400.0
Views,90584.0,1034.245176,2880.074012,0.0,5.0,45.0,514.25,20932.0,509.25
UpVotes,90584.0,734.315718,2050.869327,0.0,1.0,22.0,283.0,11442.0,282.0
DownVotes,90584.0,33.273249,134.936435,0.0,0.0,0.0,8.0,1920.0,8.0
postId,90584.0,56539.080522,33840.307529,1.0,26051.75,57225.5,86145.25,115378.0,60093.5
Score,90584.0,2.780767,4.948922,-19.0,1.0,2.0,3.0,192.0,2.0
ViewCount,90584.0,259.2534,1632.261405,0.0,0.0,0.0,111.0,175495.0,111.0
CommentCount,90584.0,1.89465,2.638704,0.0,0.0,1.0,3.0,45.0,3.0


In [15]:
# Flag extreme values column by column with the IQR rule, using a wide fence:
#   lower = Q1 - 3 * IQR,  upper = Q3 + 3 * IQR
# Every flagged row is copied into `outliers` with an extra 'Outlier' column naming the
# column that triggered it, so the same row can appear once per offending column.
# NOTE: every row of `stats` is checked, which includes the identifier columns
# (userId, postId) because they are numeric.
IQR_MULTIPLIER = 3

outlier_frames = []
for col in stats.index:
    cutoff = stats.at[col, 'IQR'] * IQR_MULTIPLIER
    lower = stats.at[col, '25%'] - cutoff
    upper = stats.at[col, '75%'] + cutoff
    is_outlier = (data[col] < lower) | (data[col] > upper)
    if is_outlier.any():
        outlier_frames.append(data[is_outlier].assign(Outlier=col))

# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop is
# quadratic; collect the pieces and concatenate once. The original row index is kept.
if outlier_frames:
    outliers = pd.concat(outlier_frames)
else:
    # No column had values outside its bounds: keep an empty frame with the same layout.
    outliers = pd.DataFrame(columns=[*data.columns, 'Outlier'])

outliers.head(10)

Unnamed: 0,CommentCount,DownVotes,Outlier,Reputation,Score,UpVotes,ViewCount,Views,postId,userId
1733,4,59,Reputation,18283,10,1014,0,3781,167,159
1734,0,59,Reputation,18283,2,1014,0,3781,169,159
1735,0,59,Reputation,18283,8,1014,0,3781,171,159
1736,0,59,Reputation,18283,35,1014,0,3781,174,159
1737,2,59,Reputation,18283,22,1014,0,3781,177,159
1738,0,59,Reputation,18283,12,1014,0,3781,179,159
1739,0,59,Reputation,18283,50,1014,39097,3781,181,159
1740,0,59,Reputation,18283,6,1014,0,3781,184,159
1741,3,59,Reputation,18283,51,1014,9772,3781,213,159
1742,2,59,Reputation,18283,15,1014,0,3781,221,159


In [18]:
# Save the outlier rows (not the full merged frame) to the csv file the exercise asks for.
outliers.to_csv('./outliers.csv', index=False)