In [6]:
import pandas as pd

In [13]:
# Parse the raw event log into impressions (SHOW) and clicks (CLICK).
# Every line is whitespace-separated and starts with an event tag:
#   SHOW  ...  -> all fields after the tag are kept as one impression row
#   CLICK ...  -> the first field after the tag is recorded
# Lines starting with any other tag are ignored.
rows = []         # one list of string fields per SHOW event
user_clicks = []  # first field of each CLICK event (compared against user_id later)
with open("train.txt") as f:
    for line in f:
        parts = line.split()
        if not parts:
            # Blank / whitespace-only line: split() returns [], so indexing
            # parts[0] below would raise IndexError. Skip it instead.
            continue
        if parts[0] == "SHOW":
            rows.append(parts[1:])
        elif parts[0] == "CLICK":
            user_clicks.append(parts[1])

In [18]:
# One row per SHOW event; `click` is True when the row's user_id also
# appears among the recorded CLICK events.
df = pd.DataFrame(rows, columns=["user_id", "zipcode", "ad"]).assign(
    click=lambda shown: shown["user_id"].isin(user_clicks)
)
df.head()

Unnamed: 0,user_id,zipcode,ad,click
0,0,54750,wine,False
1,1,54812,wine,False
2,2,54942,coffee,True
3,3,53218,coffee,False
4,4,53213,wine,False


In [20]:
# Income statistics table, indexed by the `zip` column so rows can be looked
# up by zipcode with .loc.
incomes = pd.read_csv("income.csv", index_col="zip")
incomes.head()

Unnamed: 0_level_0,income-mean,income-median
zip,Unnamed: 1_level_1,Unnamed: 2_level_1
53001,94015,72578
53002,86643,77708
53003,71386,47292
53004,90837,81250
53005,138901,104534


In [30]:
# Fetch the income row for every impression, in impression order.
# The zipcodes in `df` are cast to int before being used as labels into the
# `incomes` index.
zip_keys = df["zipcode"].astype(int)
# Drop the zipcode index so `features` gets a fresh 0..n-1 index.
features = incomes.loc[zip_keys].reset_index(drop=True)
features.head()

Unnamed: 0,income-mean,income-median
0,83863,67614
1,63414,49958
2,107470,102383
3,51411,38584
4,107369,80139


In [32]:
# Place the income features next to the impression columns. The concat is
# column-wise, so rows are matched up by index label.
train = pd.concat([df, features], axis="columns")
train.head()

Unnamed: 0,user_id,zipcode,ad,click,income-mean,income-median
0,0,54750,wine,False,83863,67614
1,1,54812,wine,False,63414,49958
2,2,54942,coffee,True,107470,102383
3,3,53218,coffee,False,51411,38584
4,4,53213,wine,False,107369,80139


# Part 2

In [46]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression

In [51]:
# Fit one click-prediction model per ad.
xcols = ["income-mean", "income-median"]  # feature columns
ycol = "click"                            # target column
models = {} # key: ad name; val: sklearn model

# groupby splits the frame once and yields the groups in sorted key order, so
# the contents/order of `models` (and therefore its repr and pickle) are the
# same on every run. Iterating set(train["ad"]) gave an order that depends on
# string hashing and re-filtered the whole frame for every ad.
for ad_name, train_subset in train.groupby("ad"):
    model = Pipeline([
        ("poly", PolynomialFeatures(3)),   # degree-3 polynomial expansion of the features
        ("std", StandardScaler()),         # standardize the expanded features
        ("lr", LogisticRegression()),      # classifier fitted on the click column
    ])

    model.fit(train_subset[xcols], train_subset[ycol])
    models[ad_name] = model
models

{'coffee': Pipeline(memory=None,
          steps=[('poly',
                  PolynomialFeatures(degree=3, include_bias=True,
                                     interaction_only=False, order='C')),
                 ('std',
                  StandardScaler(copy=True, with_mean=True, with_std=True)),
                 ('lr',
                  LogisticRegression(C=1.0, class_weight=None, dual=False,
                                     fit_intercept=True, intercept_scaling=1,
                                     l1_ratio=None, max_iter=100,
                                     multi_class='auto', n_jobs=None,
                                     penalty='l2', random_state=None,
                                     solver='lbfgs', tol=0.0001, verbose=0,
                                     warm_start=False))],
          verbose=False), 'wine': Pipeline(memory=None,
          steps=[('poly',
                  PolynomialFeatures(degree=3, include_bias=True,
                                  

In [68]:
def click_prob(zipcode, ad_name):
    """Return the model's predicted probability for one zipcode and one ad.

    Parameters
    ----------
    zipcode : label present in the index of the global ``incomes`` table.
    ad_name : key into the global ``models`` dict.

    Returns
    -------
    Element ``[0, 1]`` of ``predict_proba`` for the single income row of
    ``zipcode`` -- presumably the probability of the positive (click) class;
    verify against the fitted model's ``classes_``.

    Raises
    ------
    KeyError if ``ad_name`` is not in ``models`` or ``zipcode`` is not in
    ``incomes``.
    """
    ad_model = models[ad_name]
    # Double brackets keep the lookup two-dimensional (a one-row frame).
    zip_features = incomes.loc[[zipcode]]
    class_probs = ad_model.predict_proba(zip_features)
    return class_probs[0, 1]

def get_best_ad(zipcode, ad_names=("soda", "coffee", "wine")):
    """Return the ad with the highest predicted click probability for a zipcode.

    Parameters
    ----------
    zipcode : zipcode passed through to ``click_prob``.
    ad_names : iterable of ad names to compare, in priority order. Defaults to
        the three ads this function has always considered.

    Returns
    -------
    The name from ``ad_names`` whose ``click_prob`` is largest. On a tie the
    earliest name in ``ad_names`` wins.

    Raises
    ------
    ValueError if ``ad_names`` is empty.
    """
    # max() always yields a candidate, even when every probability is 0.0;
    # the previous manual loop only bound its result on a strictly positive
    # probability and raised UnboundLocalError otherwise.
    return max(ad_names, key=lambda ad_name: click_prob(zipcode, ad_name))
        
# Spot check: print the predicted click probability of each ad for zipcode
# 53001 (one per line), then display the best ad for zipcode 53006.
print(*(click_prob(53001, ad) for ad in ("soda", "coffee", "wine")), sep="\n")
get_best_ad(53006)

0.12156257575904296
0.2392969941359968
0.2442832729986621


'coffee'

In [70]:
import pickle

In [73]:
# Serialize the whole `models` dict into models.pkl.
with open("models.pkl", mode="wb") as model_file:
    model_file.write(pickle.dumps(models))