# Random Sampling

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
# read in data frame 

df = pd.read_csv("~/Desktop/weclouddata/data/experimental_data.csv")
df.head()

Unnamed: 0,user_id,city,tenure,channel,opu_6_week,click,experiment
0,31c0fadd6066ab12e90fa931abbb649f7f591cd7927b9b...,STL,35,14,0.5,0,0
1,458c88e94be0dba675b65ea063233ea55806bf75c920a7...,EDM,4,15,0.5,0,1
2,62a556505ca62b9d00ea39e01885649c84ee8167c08bc4...,CAL,25,15,0.67,0,1
3,cf6ebf0bd83ea0c3d12526cd4952cd778c57abb0923f0b...,MSY,68,14,0.83,0,0
4,62d891d1d4a359eeee374346a57f943e8554193870a2c1...,SAN,17,2,0.33,0,1


## Notes

There are several ways of sampling. I used the simplest one here, fully randomization.

Question: 

a. What are the potential drawbacks of this approach?             
b. How to improve?

In [3]:
# fully randomized experiment

randomized_exp = df.sample(frac = 0.1, replace=True, random_state=82)

# t-test

To test potentially normally distributed variables' mean difference, we may appply t-test. So, please run some EDA and check if the variable follow normal distribution. 

In [4]:
experimental_group = df.loc[df['experiment'] == 1]
experimental_group_opu = np.array(experimental_group["opu_6_week"])

control_group = df.loc[df['experiment'] == 0]
control_group_opu = np.array(control_group["opu_6_week"])

In [6]:
stats.ttest_ind(experimental_group_opu, control_group_opu, equal_var = False)

Ttest_indResult(statistic=-0.3242607123161102, pvalue=0.7457469910445229)

# Logistic Regression

In order to test binomial distribution variables, we cannot use t-test again. And to illustrate my idea more, I used logistic regression here. 

In [7]:
# the column click is the target variable
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split

In [8]:
X = pd.get_dummies(df.loc[:, ["city", "tenure","channel", "experiment"]])
y = pd.get_dummies(df.loc[:, "click"], drop_first=True)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [10]:
clf = LogisticRegression(random_state=0, solver='lbfgs',
                          multi_class='multinomial').fit(X_train, y_train)
#clf.predict(X[:2, :])

  y = column_or_1d(y, warn=True)


In [11]:
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [12]:
y_pred = clf.predict(X_test)

In [13]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99      3894
           1       0.00      0.00      0.00       106

   micro avg       0.97      0.97      0.97      4000
   macro avg       0.49      0.50      0.49      4000
weighted avg       0.95      0.97      0.96      4000



  'precision', 'predicted', average, warn_for)


In [14]:
print('intercept:', clf.intercept_)
print('coefficient:', clf.coef_[0])

intercept: [-1.68258737]
coefficient: [-2.35228880e-04  1.76894952e-03  1.10775502e-02  8.87954621e-02
 -1.55259188e-01  2.05023987e-02 -5.86758888e-02  3.18844384e-03
 -1.79213273e-02 -1.98717910e-01  1.09393524e-01 -1.68414639e-01
  8.57295569e-02  1.19569818e-01 -9.29369641e-02  1.26739116e-01
 -2.96836005e-02  6.82361859e-02 -1.92344559e-01 -4.07996066e-01
 -5.02032071e-02 -2.55015750e-01 -1.53729309e-01 -4.67042596e-02
 -3.03488092e-01 -1.29912258e-01  2.25112103e-01 -5.89109973e-03
 -1.63060588e-01  1.36109019e-01  2.11711641e-02  1.50990353e-02
 -6.35861020e-02]
