In [1]:
import utils
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import Imputer

from transformer import DiscountConverter

## Acquire data

In [2]:
train = utils.load_data('ccf_offline_stage1_train.csv')
test = utils.load_data('ccf_offline_stage1_test_revised.csv')

In [3]:
train.head(10)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,4663,11002.0,150:20,1.0,20160528.0,
2,1439408,2632,8591.0,20:1,0.0,20160217.0,
3,1439408,2632,1078.0,20:1,0.0,20160319.0,
4,1439408,2632,8591.0,20:1,0.0,20160613.0,
5,1439408,2632,,,0.0,,20160516.0
6,1439408,2632,8591.0,20:1,0.0,20160516.0,20160613.0
7,1832624,3381,7610.0,200:20,0.0,20160429.0,
8,2029232,3381,11951.0,200:20,1.0,20160129.0,
9,2029232,450,1532.0,30:5,0.0,20160530.0,


In [4]:
train['Date_received'] =  pd.to_datetime(train['Date_received'], format='%Y%m%d')
train['Date'] =  pd.to_datetime(train['Date'], format='%Y%m%d')

## Make labels

In [5]:
train['label'] = 1
train.loc[train['Date'].isnull() | train['Coupon_id'].isnull(), 'label'] = 0
train.loc[(train['Date'] - train['Date_received']).dt.days > 15] = 0

In [6]:
train.loc[train['label'] == 1].head(5)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label
33,1113008,1361,11166.0,20:1,0.0,2016-05-15 00:00:00,2016-05-21 00:00:00,1
38,2881376,8390,7531.0,20:5,0.0,2016-03-21 00:00:00,2016-03-29 00:00:00,1
69,114747,6901,2366.0,30:5,0.0,2016-05-23 00:00:00,2016-06-05 00:00:00,1
76,114747,5341,111.0,30:5,0.0,2016-02-07 00:00:00,2016-02-18 00:00:00,1
77,114747,5341,7751.0,50:10,0.0,2016-01-27 00:00:00,2016-01-28 00:00:00,1


## Analyze by describing data

**Which features are available in the dataset?**

In [7]:
print(train.columns.values)

['User_id' 'Merchant_id' 'Coupon_id' 'Discount_rate' 'Distance'
 'Date_received' 'Date' 'label']


**Which features are categorical**

Discount_rate, Distance, label

**Which features are numerical?**

User_id, Merchant_id, Coupon_id, Date_received, Date

**What are the data types for various features?**

    * 

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 8 columns):
User_id          int64
Merchant_id      int64
Coupon_id        float64
Discount_rate    object
Distance         float64
Date_received    object
Date             object
label            int64
dtypes: float64(2), int64(3), object(3)
memory usage: 107.1+ MB


In [9]:
train.describe()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,label
count,1754884.0,1754884.0,1053282.0,1649763.0,1754884.0
mean,3666118.0,4013.966,6743.598,2.353036,0.03669473
std,2136665.0,2449.801,4209.665,3.481937,0.1880113
min,0.0,0.0,0.0,0.0,0.0
25%,1810540.0,1945.0,2840.0,0.0,0.0
50%,3671577.0,3532.0,7379.0,0.0,0.0
75%,5518551.0,6284.0,10182.0,3.0,0.0
max,7361032.0,8856.0,14045.0,10.0,1.0


In [10]:
train.describe(include=['O'])

Unnamed: 0,Discount_rate,Date_received,Date
count,1053282,1053282,776984
unique,46,168,183
top,30:5,2016-01-29 00:00:00,0
freq,267748,70932,10987


In [11]:
train['Discount_rate'].value_counts()

30:5       267748
100:10     180908
200:20     110507
20:5        90338
20:1        50366
50:5        46963
100:30      37846
200:30      29254
300:30      28829
50:10       28163
10:5        25453
0.95        20568
150:20      17369
10:1        17301
30:1        17108
100:20      13976
30:10       12632
0           10987
50:20        8089
0.9          8085
200:50       5546
150:10       5295
100:5        4992
0.8          3440
50:1         3278
5:1          2448
100:50       1703
0.85          649
150:30        646
200:10        566
100:1         527
20:10         480
150:50        305
300:50        204
0.5           186
0.75          121
0.2           110
0.6            58
300:20         56
200:5          54
0.7            54
30:20          24
300:10         23
200:100        11
50:30           9
150:5           7
Name: Discount_rate, dtype: int64

In [12]:
discount_converter = DiscountConverter()
train = discount_converter.fit_transform(train)

In [14]:
train.head(5)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label
0,1439408,2632,,,0.0,NaT,2016-02-17 00:00:00,0
1,1439408,4663,11002.0,0.866667,1.0,2016-05-28 00:00:00,NaT,0
2,1439408,2632,8591.0,0.95,0.0,2016-02-17 00:00:00,NaT,0
3,1439408,2632,1078.0,0.95,0.0,2016-03-19 00:00:00,NaT,0
4,1439408,2632,8591.0,0.95,0.0,2016-06-13 00:00:00,NaT,0


### Categorize the most relevent features.

In [None]:
# Divide by 1.5 to limit the number of income categories
data["income_cat"] = np.ceil(data["median_income"] / 1.5)
# Label those above 5 as 5
data["income_cat"].where(data["income_cat"] < 5, 5.0, inplace=True)
data["income_cat"].value_counts()

In [None]:
train.hist(figsize=(18, 15))

In [None]:
g = sns.FacetGrid(train, col='label', size=8, aspect=1, sharey=False)
g.map(plt.hist, 'Distance', bins=10)

In [None]:
g = sns.FacetGrid(train, col='label', size=4, aspect=1, sharey=False)
g.map(plt.hist, 'Merchant_id', bins=10)

In [None]:
g = sns.FacetGrid(train, col='label', size=8, aspect=1, sharey=False)
g.map(plt.hist, 'User_id', bins=10)

In [None]:
g = sns.FacetGrid(train, col='label', size=8, aspect=1, sharey=False)
g.map(plt.hist, 'Coupon_id', bins=10)

## Prepare the data for machine learning algorithms

In [None]:
X = train.drop(["label", "Date_received", "Date"], axis=1)
y = train["label"].copy()

In [None]:
sample_incomplete_rows = X[X.isnull().any(axis=1)].head()
sample_incomplete_rows

In [None]:
X['Discount_rate'].values.shape

In [None]:
X['Discount_rate'] = X['Discount_rate'].apply(lambda x: [1, 2])

In [None]:
X

In [None]:
train.tail(100)