#### 1. Import the pandas, NumPy and SciPy libraries

In [211]:
import pandas as pd
import numpy as np
from scipy import stats

#### 2. Import users table:

In [212]:
# Read the users table from its CSV export, then show its (rows, columns) shape.
users = pd.read_csv(filepath_or_buffer='users_table.csv')
users.shape

(40325, 14)

#### 3. Rename Id column to userId

In [213]:
# Rename the key column from Id to userId. rename() returns a new frame, so the
# name is rebound instead of mutating the loaded frame with inplace=True.
users = users.rename(columns={'Id': 'userId'})
users.dtypes

userId               int64
Reputation           int64
CreationDate        object
DisplayName         object
LastAccessDate      object
WebsiteUrl          object
Location            object
AboutMe             object
Views                int64
UpVotes              int64
DownVotes            int64
AccountId            int64
Age                float64
ProfileImageUrl     object
dtype: object

#### 4. Import posts table:

In [214]:
# Read the posts table from its CSV export, then list the dtype of every column.
posts = pd.read_csv(filepath_or_buffer='posts_table.csv')
posts.dtypes

Id                         int64
PostTypeId                 int64
AcceptedAnswerId         float64
CreaionDate               object
Score                      int64
ViewCount                float64
Body                      object
OwnerUserId              float64
LasActivityDate           object
Title                     object
Tags                      object
AnswerCount              float64
CommentCount               int64
FavoriteCount            float64
LastEditorUserId         float64
LastEditDate              object
CommunityOwnedDate        object
ParentId                 float64
ClosedDate                object
OwnerDisplayName          object
LastEditorDisplayName     object
dtype: object

#### 5. Rename Id column to postId and OwnerUserId to userId

In [215]:
# Rename the key columns: Id -> postId and OwnerUserId -> userId.
# Rebinding the result of rename() avoids mutating the loaded frame in place.
posts = posts.rename(columns={'Id': 'postId', 'OwnerUserId': 'userId'})

#### 6. Define new dataframes for users and posts with the following selected columns:

- **users columns**: userId, Reputation, Views, UpVotes, DownVotes
- **posts columns**: postId, Score, userId, ViewCount, CommentCount

In [216]:
# Trim both frames down to the columns used in the rest of the analysis.
user_columns = ['userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes']
post_columns = ['postId', 'Score', 'userId', 'ViewCount', 'CommentCount']

users = users.loc[:, user_columns]
posts = posts.loc[:, post_columns]

#### 7. Merge both dataframes, users and posts. 
Use [`DataFrame.merge`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html) to join the posts and users dataframes.

In [217]:
# Check the column dtypes of the trimmed users frame before merging.
users.dtypes

userId        int64
Reputation    int64
Views         int64
UpVotes       int64
DownVotes     int64
dtype: object

In [218]:
# Join each post to its author's user record. how='inner' is the default join,
# so only rows whose userId appears in both frames are kept.
users_posts = pd.merge(left=users, right=posts, how='inner', on='userId')
users_posts.shape

(38962, 9)

#### 8. How many missing values do you have in your merged dataframe? On which columns?

In [219]:
# Count the missing values in each column of the merged frame, then list the
# posts columns so an affected column can be traced back to its source table.
missing_per_column = users_posts.isnull().sum()
print(missing_per_column)
print(posts.columns)
# Only ViewCount, which comes from the posts dataframe, has missing values (about 23k of them).

userId              0
Reputation          0
Views               0
UpVotes             0
DownVotes           0
postId              0
Score               0
ViewCount       23572
CommentCount        0
dtype: int64
Index(['postId', 'Score', 'userId', 'ViewCount', 'CommentCount'], dtype='object')


#### 9. You will need to do something about the missing values. Will you drop them or fill them? Explain.
**Remember** to check the results of your code before moving on to the next step.

In [220]:
# One possibility is that the NaNs stand for posts with zero views, so first
# look at which ViewCount values actually occur.
print(users_posts['ViewCount'].value_counts())
print(users_posts['ViewCount'][users_posts['ViewCount'] == 98])
print(users_posts['ViewCount'][users_posts['ViewCount'] == 0])

# There are a lot of entries with a ViewCount of 98 but none at all with a
# ViewCount of 0. Because it is highly unlikely that no post has zero views,
# the NaNs are treated as zeros and replaced.
# NOTE(review): this is an assumption, not something the data proves; confirm
# that a missing ViewCount really means zero views (e.g. by checking PostTypeId
# in the raw posts table).
#
# The filled column is assigned back explicitly. Calling
# fillna(..., inplace=True) on the selected column is chained assignment: it
# warns on pandas 2.x and leaves users_posts unchanged under Copy-on-Write.
users_posts['ViewCount'] = users_posts['ViewCount'].fillna(0)
print(users_posts['ViewCount'].value_counts())

98.0      43
150.0     43
122.0     42
156.0     41
108.0     41
          ..
4414.0     1
9772.0     1
1796.0     1
4976.0     1
2174.0     1
Name: ViewCount, Length: 3402, dtype: int64
3783     98.0
4962     98.0
9146     98.0
10242    98.0
11610    98.0
12399    98.0
18635    98.0
19165    98.0
19188    98.0
20384    98.0
20553    98.0
21331    98.0
21787    98.0
22577    98.0
22861    98.0
23103    98.0
23280    98.0
23678    98.0
23711    98.0
23872    98.0
25400    98.0
26363    98.0
27079    98.0
27203    98.0
27752    98.0
27846    98.0
30138    98.0
30508    98.0
30529    98.0
30991    98.0
32295    98.0
32781    98.0
32816    98.0
33329    98.0
34078    98.0
34574    98.0
35873    98.0
36613    98.0
36631    98.0
36688    98.0
37074    98.0
37786    98.0
38072    98.0
Name: ViewCount, dtype: float64
Series([], Name: ViewCount, dtype: float64)
0.0       23572
150.0        43
98.0         43
122.0        42
156.0        41
          ...  
4414.0        1
9772.0        1
1796.0 

#### 10. Adjust the data types in order to avoid future issues. Which ones should be changed? 

In [221]:
# Cast ViewCount to a 64-bit integer; every other column keeps its dtype.
view_count_dtype = {'ViewCount': 'int64'}
users_posts = users_posts.astype(dtype=view_count_dtype)
users_posts.dtypes

userId          int64
Reputation      int64
Views           int64
UpVotes         int64
DownVotes       int64
postId          int64
Score           int64
ViewCount       int64
CommentCount    int64
dtype: object

# Detecting outliers
- Any z-score greater than 3 or less than -3 is considered to be an outlier. 
- This rule of thumb is based on the empirical rule. 
- Under this rule, almost all (99.7%) of approximately normally distributed data lies within three standard deviations of the mean.

In [224]:
# Flag every row in which at least one numeric metric has an absolute z-score
# above Z_SCORE_THRESHOLD, i.e. lies more than that many standard deviations
# from its column mean.
Z_SCORE_THRESHOLD = 3

detect_outliers = users_posts[['Reputation', 'Views', 'UpVotes', 'DownVotes', 'Score', 'ViewCount', 'CommentCount']]

# zscore standardises each column independently (axis=0 is its default).
abs_z_scores = np.abs(stats.zscore(detect_outliers))
outliers = detect_outliers[(abs_z_scores > Z_SCORE_THRESHOLD).any(axis=1)]

# End on the frame itself so the notebook renders the flagged rows.
outliers

Unnamed: 0,Reputation,Views,UpVotes,DownVotes,Score,ViewCount,CommentCount
0,1,0,5007,1920,0,0,0
1,1,0,5007,1920,0,0,0
2,1,0,5007,1920,0,0,0
3,1,0,5007,1920,0,0,0
4,1,0,5007,1920,0,0,0
...,...,...,...,...,...,...,...
38889,477,46,96,1,4,7773,3
38890,477,46,96,1,11,15780,6
38903,408,27,0,0,47,0,1
38950,384,77,18,0,47,4463,11
