## Quiz 1: Understanding the Dataset

In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import os

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Seed Python's built-in `random` module so any draws from it are repeatable.
# NOTE(review): this does not seed NumPy's generator; if np.random is used
# later in the notebook it needs its own seed — confirm.
random.seed(42)

In [2]:
# Read the A/B test results from the local data directory and preview them.
ab_data_file = os.path.join('./data', 'ab_data.csv')
df = pd.read_csv(filepath_or_buffer=ab_data_file)

# Show the first five rows.
df.head(n=5)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [3]:
# Total number of rows (one per recorded page visit) in the raw dataset.
len(df)

294478

In [4]:
# Number of distinct user_id values in the raw dataset.
# dropna=False keeps the count identical to len(unique()), which would
# include a missing value as its own entry.
df['user_id'].nunique(dropna=False)

290584

In [5]:
# Overall conversion rate: `converted` is a 0/1 flag, so its mean is the
# proportion of rows that converted.
df.loc[:, 'converted'].mean()

0.11965919355605512

In [6]:
# Count the rows where the assigned group and the page actually served
# disagree: control users shown new_page, or treatment users shown old_page.
gk = df.groupby('group')

control_group = gk.get_group('control')
treatment_group = gk.get_group('treatment')

# Control rows that were (incorrectly) served the new page.
num_new_page_n_control = len(control_group.query("landing_page == 'new_page'"))
# Treatment rows that were (incorrectly) served the old page.
num_old_page_n_treatment = len(treatment_group.query("landing_page == 'old_page'"))

num_new_page_n_control + num_old_page_n_treatment

3893

In [7]:
# Check for missing values: compare each column's non-null count in the
# summary below against the total number of entries.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


**Answer**: There are no missing values — every column reports 294478 non-null entries, which matches the total number of rows.

---
## Quiz 2: Messy Data

In [8]:
# Build df2 by removing the rows where group and landing_page disagree:
#   - control users who were served new_page
#   - treatment users who were served old_page
#
# The mask is computed directly from `df`, so this cell does not depend on
# variables created in other cells and can be re-run on its own with the
# same result.
mismatched = (
    ((df['group'] == 'control') & (df['landing_page'] == 'new_page'))
    | ((df['group'] == 'treatment') & (df['landing_page'] == 'old_page'))
)

# .copy() gives df2 its own data so later edits never touch (or warn about) df.
df2 = df[~mismatched].copy()

In [9]:
# Sanity check: after cleaning, "is in treatment" and "saw new_page" must
# agree on every row, so the number of disagreeing rows should be 0.
len(
    df2[(df2['group'] == 'treatment') != (df2['landing_page'] == 'new_page')]
)

0

---
## Quiz 3: Updated Dataframe

In [10]:
# Number of distinct user_id values in the cleaned frame df2.
# dropna=False keeps the count identical to len(unique()).
df2['user_id'].nunique(dropna=False)

290584

In [11]:
# Rows whose user_id repeats an earlier row in df2. The first occurrence of
# each user_id is not flagged, only the later repeats.
duplicated_user = df2.loc[df2.duplicated(subset='user_id')]

duplicated_user

Unnamed: 0,user_id,timestamp,group,landing_page,converted
2893,773192,2017-01-14 02:55:59.590927,treatment,new_page,0


In [12]:
# Remove repeated user_id rows, keeping each user's first record.
# drop_duplicates is idempotent and needs no index labels saved by another
# cell, so re-running this cell is safe (dropping already-removed labels
# would raise KeyError).
df2 = df2.drop_duplicates(subset='user_id', keep='first')

# Every user should now appear exactly once.
assert df2['user_id'].is_unique

---
## Quiz 4: Probability

In [13]:
# P(converted): conversion rate across all remaining rows, regardless of
# which page was shown.
df2.loc[:, 'converted'].mean()

0.11959708724499628

In [14]:
# P(converted | control): conversion rate among rows in the control group.
df2.loc[df2['group'] == 'control', 'converted'].mean()

0.1203863045004612

In [15]:
# P(converted | treatment): conversion rate among rows in the treatment group.
df2.loc[df2['group'] == 'treatment', 'converted'].mean()

0.11880806551510564

In [16]:
# P(new_page): share of the remaining rows that were served the new page.
df2_gk = df2.groupby('landing_page')

len(df2_gk.get_group('new_page')) / len(df2)

0.5000619442226688