# Data Load

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

train_bookmark = pd.read_csv('Train/train_bookmark.csv', parse_dates=['dates'], infer_datetime_format=True)
train_service = pd.read_csv('Train/train_service.csv', parse_dates=['registerdate','enddate'], infer_datetime_format=True)
coin = pd.read_csv('Code/coin.csv')
content_info = pd.read_csv('Code/content_info.csv')
movie_info = pd.read_csv('Code/movie_info.csv')
predict_bookmark = pd.read_csv('Predict/predict_bookmark.csv', parse_dates=['dates'], infer_datetime_format=True)
predict_service = pd.read_csv('Predict/predict_service.csv', parse_dates=['registerdate','enddate'], infer_datetime_format=True)

# 전처리, 결측치 처리

In [2]:
exchange_rate = 1138.5

train_service['promo_100'] = train_service['promo_100'].apply(lambda x: 0 if pd.isna(x) else 1 )
train_service['coinReceived'] = train_service['coinReceived'].apply(lambda x: 0 if pd.isna(x) else 1 )
train_service['isauth'] = train_service['isauth'].apply(lambda x: 0 if pd.isna(x) else 1 )
train_service['Repurchase'] = train_service['Repurchase'].apply(lambda x: 1 if x=='X' else 0)
train_service['gender'] = train_service['gender'].fillna('N')
train_service['pgamount'] = train_service['pgamount'].apply(lambda x: x * exchange_rate if x<100 else x)

predict_service['promo_100'] = predict_service['promo_100'].apply(lambda x: 0 if pd.isna(x) else 1 )
predict_service['coinReceived'] = predict_service['coinReceived'].apply(lambda x: 0 if pd.isna(x) else 1 )
predict_service['isauth'] = predict_service['isauth'].apply(lambda x: 0 if pd.isna(x) else 1 )
# predict_service['Repurchase'] = predict_service['Repurchase'].apply(lambda x: int(1) if x=='X' else int(np.nan))
predict_service['gender'] = predict_service['gender'].fillna('N')
predict_service['pgamount'] = predict_service['pgamount'].apply(lambda x: x * exchange_rate if x<100 else x)

In [None]:
train_service[train_service.enddate >= train_service.registerdate + pd.DateOffset(weeks=3)]

In [None]:
predict_service[predict_service.enddate >= predict_service.registerdate + pd.DateOffset(weeks=3)]

# EDA

In [None]:
# train_service.groupby('uno')['registerdate'].count().sort_values(ascending=False)
train_service[train_service['uno'] == 'a02a14ff1ab86edbfe46ad5a6a7fce054dc83c39aa7c362622171c550dcfe7099733a95d986e379eedfcb8edb5adc79c3f2439a56104f63410b83a10131c8ea5'].sort_values(by='registerdate')

In [None]:
train_bookmark[train_bookmark['uno'] == 'a02a14ff1ab86edbfe46ad5a6a7fce054dc83c39aa7c362622171c550dcfe7099733a95d986e379eedfcb8edb5adc79c3f2439a56104f63410b83a10131c8ea5'].sort_values(by=['dates', 'hour'])

In [None]:
train_service[train_service['registerdate'].isna()]['uno']
train_service.loc[15219]['uno']

In [None]:
train_service['promo_100'].unique()

In [None]:
predict_service.head()

In [None]:
coin.head()

In [None]:
content_info.head()

In [None]:
movie_info.head()

# Feature 추가

## View Info

In [None]:
import numpy as np
from datetime import timedelta

# pd.to_datetime((train_service.loc[0, 'registerdate'].date() + timedelta(days=1)))

In [3]:
train_bookmark_grb_tot = pd.DataFrame(train_bookmark.groupby(['uno'])['dates'].count())
train_bookmark_grb_tot.rename(columns={'dates':'tot_view_count'}, inplace=True)
train_bookmark_grb_tot.reset_index(drop=False, inplace=True)


train_bookmark_grb_tot = pd.merge(train_bookmark_grb_tot, pd.DataFrame(train_bookmark.groupby(['uno'])['viewtime'].sum()), on=['uno'])
train_bookmark_grb_tot.rename(columns={'viewtime':'tot_viewtime_sum'}, inplace=True)

train_bookmark_grb_tot = pd.merge(train_bookmark_grb_tot, pd.DataFrame(train_bookmark.groupby(['uno'])['viewtime'].mean()), on=['uno'])
train_bookmark_grb_tot.rename(columns={'viewtime':'tot_viewtime_mean'}, inplace=True)

train_bookmark_grb_tot = pd.merge(train_bookmark_grb_tot, pd.DataFrame(train_bookmark.groupby(['uno'])['viewtime'].std()), on=['uno'])
train_bookmark_grb_tot.rename(columns={'viewtime':'tot_viewtime_std'}, inplace=True)

train_bookmark_grb_tot

Unnamed: 0,uno,tot_view_count,tot_viewtime_sum,tot_viewtime_mean,tot_viewtime_std
0,0000555c21e7942b8281c8068c2b5be0a628b8a1a3cbea...,1122,600960,535.614973,747.631661
1,0000660c47ef815351301bf15ec9bccb8deeb10083e3c8...,61,69000,1131.147541,981.838579
2,00012d296d8a780358116414b0dcb64b74dfebf3200cbe...,1,230,230.000000,
3,00016a195056fe7ff99b890eb5ad7cf83617498b6fbdbc...,140,119110,850.785714,674.277730
4,0001d4be392f82bf5b8b912418c7981f2507341c8cbe17...,106,115300,1087.735849,694.018365
...,...,...,...,...,...
67301,fffced43deaf4457c319b20066d48e7448366892bbbb9b...,54,57630,1067.222222,945.967443
67302,fffe8c58d0b918895b55bd7907f5f510fbe11c4b8f6515...,275,425220,1546.254545,1056.796292
67303,ffff0d42a9cb0c7ba43546929d3c0dc0b141a2ec00fd59...,302,434350,1438.245033,1100.996785
67304,ffff49a60e9440a42fbfb9592aa4dfae10e9493d941b85...,128,121930,952.578125,818.995204


In [4]:
train_service = pd.merge(train_service, train_bookmark_grb_tot, on='uno', how='left')
train_service['tot_view_count'].fillna(0, inplace=True)
train_service['tot_viewtime_sum'].fillna(0, inplace=True)
train_service['tot_viewtime_mean'].fillna(0, inplace=True)
train_service['tot_viewtime_std'].fillna(0, inplace=True)

In [5]:
predict_bookmark_grb_tot = pd.DataFrame(predict_bookmark.groupby(['uno'])['dates'].count())
predict_bookmark_grb_tot.rename(columns={'dates':'tot_view_count'}, inplace=True)
predict_bookmark_grb_tot.reset_index(drop=False, inplace=True)


predict_bookmark_grb_tot = pd.merge(predict_bookmark_grb_tot, pd.DataFrame(predict_bookmark.groupby(['uno'])['viewtime'].sum()), on=['uno'])
predict_bookmark_grb_tot.rename(columns={'viewtime':'tot_viewtime_sum'}, inplace=True)

predict_bookmark_grb_tot = pd.merge(predict_bookmark_grb_tot, pd.DataFrame(predict_bookmark.groupby(['uno'])['viewtime'].mean()), on=['uno'])
predict_bookmark_grb_tot.rename(columns={'viewtime':'tot_viewtime_mean'}, inplace=True)

predict_bookmark_grb_tot = pd.merge(predict_bookmark_grb_tot, pd.DataFrame(predict_bookmark.groupby(['uno'])['viewtime'].std()), on=['uno'])
predict_bookmark_grb_tot.rename(columns={'viewtime':'tot_viewtime_std'}, inplace=True)

predict_bookmark_grb_tot

Unnamed: 0,uno,tot_view_count,tot_viewtime_sum,tot_viewtime_mean,tot_viewtime_std
0,00005933bf5b80f52ee0981778d9be8c410bb662d2e78c...,111,280130,2523.693694,2121.187030
1,0002d606e81dfbf6445cf96f9d54f4577dda8361ffe2b1...,481,643140,1337.089397,1003.673799
2,00100c30414d08d69679faa672754c92965b81a2cd4b13...,368,474550,1289.538043,851.818413
3,0019156e67468401eff62e992d581d721c2f2d68d78dc6...,67,79270,1183.134328,965.409646
4,0019ebcf13ea62a20b0e6626103f4d2164e61c64355b9d...,31,41250,1330.645161,1159.804396
...,...,...,...,...,...
18306,ffe4649641ec7255718ea6498c1228971486b07c08495a...,424,537680,1268.113208,855.380538
18307,ffe61ff67fdfdbd6dc26ce0dbbd0be4b8a6d400bd8f47e...,162,271850,1678.086420,1230.284790
18308,ffef5515f5136b7db04522aa06927601eee80d6e33aff4...,5,4530,906.000000,511.888660
18309,fff7166dc78cbafa2d528b60c8ad3a02de0c801d67671f...,206,78280,380.000000,489.201440


In [6]:
predict_service = pd.merge(predict_service, predict_bookmark_grb_tot, on='uno', how='left')
predict_service['tot_view_count'].fillna(0, inplace=True)
predict_service['tot_viewtime_sum'].fillna(0, inplace=True)
predict_service['tot_viewtime_mean'].fillna(0, inplace=True)
predict_service['tot_viewtime_std'].fillna(0, inplace=True)

In [None]:
predict_service.head()

In [7]:
train_bookmark_grb = pd.DataFrame(train_bookmark.groupby(['uno', 'dates'])['dates'].count())
train_bookmark_grb.rename(columns={'dates':'view_count'}, inplace=True)
train_bookmark_grb.reset_index(drop=False, inplace=True)


train_bookmark_grb = pd.merge(train_bookmark_grb, pd.DataFrame(train_bookmark.groupby(['uno', 'dates'])['viewtime'].sum()), on=['uno', 'dates'])
train_bookmark_grb.rename(columns={'viewtime':'viewtime_sum'}, inplace=True)

train_bookmark_grb = pd.merge(train_bookmark_grb, pd.DataFrame(train_bookmark.groupby(['uno', 'dates'])['viewtime'].mean()), on=['uno', 'dates'])
train_bookmark_grb.rename(columns={'viewtime':'viewtime_mean'}, inplace=True)

train_bookmark_grb = pd.merge(train_bookmark_grb, pd.DataFrame(train_bookmark.groupby(['uno', 'dates'])['viewtime'].std()), on=['uno', 'dates'])
train_bookmark_grb.rename(columns={'viewtime':'viewtime_std'}, inplace=True)

import sqlite3

conn = sqlite3.connect(':memory:')

train_service.to_sql('train_service', conn, index=False)
train_bookmark_grb.to_sql('train_bookmark_grb', conn, index=False)

query = '''
select train_service.uno,
       registerdate, 
       enddate, 
       sum(view_count) view_count, 
       sum(viewtime_sum) viewtime_sum, 
       sum(viewtime_mean) viewtime_mean, 
       sum(viewtime_std) viewtime_std
from train_service
left join train_bookmark_grb
on train_bookmark_grb.dates >= registerdate and train_bookmark_grb.dates <= enddate
and train_service.uno = train_bookmark_grb.uno 
group by train_service.uno, registerdate, enddate
'''
sqlDf = pd.read_sql_query(query, conn)
sqlDf['registerdate'] = pd.to_datetime(sqlDf['registerdate'])
sqlDf['enddate'] = pd.to_datetime(sqlDf['enddate'])
train_service = pd.merge(train_service, sqlDf, on=['uno', 'registerdate', 'enddate'])

In [8]:
predict_bookmark_grp = pd.DataFrame(predict_bookmark.groupby(['uno', 'dates'])['dates'].count())
predict_bookmark_grp.rename(columns={'dates':'view_count'}, inplace=True)
predict_bookmark_grp.reset_index(drop=False, inplace=True)


predict_bookmark_grp = pd.merge(predict_bookmark_grp, pd.DataFrame(predict_bookmark.groupby(['uno', 'dates'])['viewtime'].sum()), on=['uno', 'dates'])
predict_bookmark_grp.rename(columns={'viewtime':'viewtime_sum'}, inplace=True)

predict_bookmark_grp = pd.merge(predict_bookmark_grp, pd.DataFrame(predict_bookmark.groupby(['uno', 'dates'])['viewtime'].mean()), on=['uno', 'dates'])
predict_bookmark_grp.rename(columns={'viewtime':'viewtime_mean'}, inplace=True)

predict_bookmark_grp = pd.merge(predict_bookmark_grp, pd.DataFrame(predict_bookmark.groupby(['uno', 'dates'])['viewtime'].std()), on=['uno', 'dates'])
predict_bookmark_grp.rename(columns={'viewtime':'viewtime_std'}, inplace=True)

import sqlite3

conn = sqlite3.connect(':memory:')

predict_service.to_sql('predict_service', conn, index=False)
predict_bookmark_grp.to_sql('predict_bookmark_grp', conn, index=False)

query = '''
select predict_service.uno,
       registerdate, 
       enddate, 
       sum(view_count) view_count, 
       sum(viewtime_sum) viewtime_sum, 
       sum(viewtime_mean) viewtime_mean, 
       sum(viewtime_std) viewtime_std
from predict_service
left join predict_bookmark_grp
on predict_bookmark_grp.dates >= registerdate and predict_bookmark_grp.dates <= enddate
and predict_service.uno = predict_bookmark_grp.uno 
group by predict_service.uno, registerdate, enddate
'''
sqlDf = pd.read_sql_query(query, conn)
sqlDf['registerdate'] = pd.to_datetime(sqlDf['registerdate'])
sqlDf['enddate'] = pd.to_datetime(sqlDf['enddate'])
predict_service = pd.merge(predict_service, sqlDf, on=['uno', 'registerdate', 'enddate'])

In [None]:
predict_service.head()

## Productcode별 해지율

In [9]:
rateByproductcode = pd.DataFrame(train_service.groupby(['productcode', 'Repurchase'])['Repurchase'].count()) / pd.DataFrame(train_service.groupby(['productcode'])['Repurchase'].count())
rateByproductcode.columns = ['ChurnRateByProductcode']
rateByproductcode.reset_index(inplace=True)

train_service = pd.merge(train_service, 
                 rateByproductcode[rateByproductcode['Repurchase'] == 1][['productcode', 'ChurnRateByProductcode']], 
                 on='productcode', how='left')

predict_service = pd.merge(predict_service, 
                 rateByproductcode[rateByproductcode['Repurchase'] == 1][['productcode', 'ChurnRateByProductcode']], 
                 on='productcode', how='left')
predict_service.head()

Unnamed: 0,uno,registerdate,enddate,productcode,pgamount,chargetypeid,concurrentwatchcount,promo_100,coinReceived,Repurchase,...,agegroup,tot_view_count,tot_viewtime_sum,tot_viewtime_mean,tot_viewtime_std,view_count,viewtime_sum,viewtime_mean,viewtime_std,ChurnRateByProductcode
0,9c1c04380d3ec71c9ea55cb99ad803ab7c0037a3482b9b...,2021-03-14 16:44:57,2021-04-14 16:44:57,pk_1487,100.0,190,1,1,0,,...,20,87.0,96890.0,1113.678161,948.960404,87.0,96890.0,15847.277778,11693.723742,0.319453
1,b725d844efdb214963a6ccae004778d0fe40f8a0b5e901...,2021-03-14 21:33:10,2021-04-14 21:33:10,pk_1488,100.0,134,2,1,0,,...,35,48.0,24240.0,505.0,488.684731,41.0,20970.0,5508.378788,3402.540033,0.213888
2,7de6c80c6cb5c5098bbfef8d9da75dfdd338b681d96691...,2021-03-14 10:55:38,2021-04-14 10:55:38,pk_2025,100.0,151,1,1,0,,...,40,13.0,9460.0,727.692308,1304.077924,,,,,0.293849
3,a5a209071166d24243bf47955ca91bb590a9d3bee46d71...,2021-03-14 00:19:21,2021-04-14 00:19:21,pk_1488,10900.0,190,2,0,0,,...,40,151.0,178570.0,1182.582781,1022.527238,147.0,173160.0,20966.731962,16397.213048,0.213888
4,7a6960912bebe03c6e4c770eb1aa91329c3497f18f90ca...,2021-03-14 20:08:31,2021-04-14 20:08:31,pk_1489,100.0,134,4,1,0,,...,20,36.0,27890.0,774.722222,645.660176,33.0,25890.0,7201.818182,4224.081604,0.439507


## chargetypeid별 해지율

In [10]:
rateByChargetypeid = pd.DataFrame(train_service.groupby(['chargetypeid', 'Repurchase'])['Repurchase'].count()) / pd.DataFrame(train_service.groupby(['chargetypeid'])['Repurchase'].count())
rateByChargetypeid.columns = ['ChurnRateByChargetypeid']
rateByChargetypeid.reset_index(inplace=True)

train_service = pd.merge(train_service, 
                 rateByChargetypeid[rateByChargetypeid['Repurchase'] == 1][['chargetypeid', 'ChurnRateByChargetypeid']], 
                 on='chargetypeid', how='left')
train_service['ChurnRateByChargetypeid'].fillna(0, inplace=True)

predict_service = pd.merge(predict_service, 
                 rateByChargetypeid[rateByChargetypeid['Repurchase'] == 1][['chargetypeid', 'ChurnRateByChargetypeid']], 
                 on='chargetypeid', how='left')
predict_service['ChurnRateByChargetypeid'].fillna(0, inplace=True)

In [None]:
rateByChargetypeid

## Device Count

In [11]:
train_service = pd.merge(train_service, 
                 train_bookmark.groupby('uno')['devicetype'].nunique(), 
                 on='uno', how='left')

train_service.rename(columns={'devicetype':'devicetype_cnt'}, inplace=True)

predict_service = pd.merge(predict_service, 
                 predict_bookmark.groupby('uno')['devicetype'].nunique(), 
                 on='uno', how='left')
predict_service.rename(columns={'devicetype':'devicetype_cnt'}, inplace=True)

## 고객별 가입이력

In [None]:
pd.DataFrame(train_service.groupby('uno')['registerdate'].count()).sort_values(by='registerdate', ascending=False)

In [None]:
train_service.groupby(['uno'])['Repurchase'].sum() / train_service.groupby(['uno'])['registerdate'].count()

In [None]:
# 가입횟수
df_join_info = pd.DataFrame()
df_join_info['join_count'] = train_service.groupby(['uno'])['registerdate'].count()

# 해지횟수
df_join_info['chrun_count'] = train_service.groupby(['uno'])['registerdate'].count() - train_service.groupby(['uno'])['Repurchase'].sum()

# 해지율
df_join_info['churn_rate'] = df_join_info['chrun_count'] / df_join_info['join_count']

In [None]:
df_join_info[(df_join_info['churn_rate'] > 0) & (df_join_info['churn_rate'] < 1)].sort_values(by='churn_rate')

In [None]:
train_service[train_service['uno'] == '9dbaa030c6b2e1617537475b85dd57d7d6b993fa2387b61db8d05dc57e86f650fb10c003859b12fa7f7adda43142cac7c8a8aa12713ce1ef498a37e9e05202b7']

In [None]:
train_service['rateByChargetypeid'].unique()

In [None]:
train_service[train_service['chargetypeid'] == 121]

## 시리즈물 시청 정보

In [None]:
train_bookmark.head()

In [None]:
content_info[content_info['contentid'].str.contains('S01_E454434902')]

In [None]:
movie_info.head()

# Feature Select

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

plt.figure(figsize=(18,10))
sns.heatmap(train_service.corr(), annot=True)   

In [None]:
train_columns = list(train_service.columns)
train_columns.remove('Repurchase')
train_columns

# Model

## KNN

In [None]:
train_service_target = train_service[train_service.enddate >= (train_service.registerdate + pd.DateOffset(weeks=3))]
predict_service_target = predict_service[predict_service.enddate >= (predict_service.registerdate + pd.DateOffset(weeks=3))]


In [None]:
for c in train_service[train_columns].select_dtypes(['float64', 'int64']).columns:
  train_service.fillna(0, inplace=True)

for c in predict_service[train_columns].select_dtypes(['float64', 'int64']).columns:
  predict_service.fillna(0, inplace=True)  

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(train_service[train_columns].select_dtypes(['float64', 'int64']), 
                                                    train_service['Repurchase'], 
                                                    test_size=0.2, 
                                                    stratify=train_service['Repurchase'])

scaler = MinMaxScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)

params = {
    'n_neighbors' : range(1, 21, 1)   
}
clf = KNeighborsClassifier()
grid_cv = GridSearchCV(clf, param_grid=params, scoring='f1', cv=5, verbose=1)
grid_cv.fit(X_train_norm, y_train)

# clf.fit(X_train, y_train)
pred = grid_cv.best_estimator_.predict(X_test_norm)
print(grid_cv.best_score_)
print(grid_cv.best_params_)
print(classification_report(y_test, pred))


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(train_service_target[train_columns].select_dtypes(['float64', 'int64']), 
                                                    train_service_target['Repurchase'], 
                                                    test_size=0.2, 
                                                    stratify=train_service_target['Repurchase'])

scaler = MinMaxScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)

params = {
    'n_neighbors' : range(1, 21, 1)   
}
clf = KNeighborsClassifier()
grid_cv = GridSearchCV(clf, param_grid=params, scoring='f1', cv=5, verbose=1)
grid_cv.fit(X_train_norm, y_train)

# clf.fit(X_train, y_train)
pred = grid_cv.best_estimator_.predict(X_test_norm)
print(grid_cv.best_score_)
print(grid_cv.best_params_)
print(classification_report(y_test, pred))


## Decision Tree

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(train_service[train_columns].select_dtypes(['float64', 'int64']), 
                                                    train_service['Repurchase'], 
                                                    test_size=0.2, 
                                                    stratify=train_service['Repurchase'])

params = {
    'max_depth' : range(1, 21, 2),
    'min_samples_split' : range(1, 11, 1)    
}
clf = DecisionTreeClassifier()
grid_cv = GridSearchCV(clf, param_grid=params, scoring='f1', cv=5, verbose=1)
grid_cv.fit(X_train, y_train)

# clf.fit(X_train, y_train)
pred = grid_cv.best_estimator_.predict(X_test)
print(grid_cv.best_score_)
print(grid_cv.best_params_)
print(classification_report(y_test, pred))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(train_service_target[train_columns].select_dtypes(['float64', 'int64']), 
                                                    train_service_target['Repurchase'], 
                                                    test_size=0.2, 
                                                    stratify=train_service_target['Repurchase'])

params = {
    'max_depth' : range(1, 21, 2),
    'min_samples_split' : range(1, 11, 1)    
}
clf = DecisionTreeClassifier()
grid_cv = GridSearchCV(clf, param_grid=params, scoring='f1', cv=5, verbose=1)
grid_cv.fit(X_train, y_train)

# clf.fit(X_train, y_train)
pred = grid_cv.best_estimator_.predict(X_test)
print(grid_cv.best_score_)
print(grid_cv.best_params_)
print(classification_report(y_test, pred))

In [None]:
pred_submission = clf.predict(predict_service[train_columns].select_dtypes(['float64', 'int64']))
predict_service['pred'] = pred_submission
predict_service.head()

## RandomForestClassifier

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(train_service[train_columns].select_dtypes(['float64', 'int64']), 
                                                    train_service['Repurchase'], 
                                                    test_size=0.2, 
                                                    stratify=train_service['Repurchase'])

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

rcf = RandomForestClassifier()
rcf.fit(X_train, y_train)
pred = rcf.predict(X_test)
print(classification_report(y_test, pred))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(train_service_target[train_columns].select_dtypes(['float64', 'int64']), 
                                                    train_service_target['Repurchase'], 
                                                    test_size=0.2, 
                                                    stratify=train_service_target['Repurchase'])

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

rcf = RandomForestClassifier()
rcf.fit(X_train, y_train)
pred = rcf.predict(X_test)
print(classification_report(y_test, pred))

In [None]:
pred_submission = rcf.predict(predict_service[train_columns].select_dtypes(['float64', 'int64']))
predict_service['pred'] = pred_submission
predict_service.head()

## Randomforest GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(train_service[train_columns].select_dtypes(['float64', 'int64']), 
                                                    train_service['Repurchase'], 
                                                    test_size=0.2, 
                                                    stratify=train_service['Repurchase'])

params = {
    'n_estimators': [300], #list(range(100, 501, 100)),
#     'max_features' : [int(np.sqrt(len(train_columns)))],
    'max_depth' : [6], #range(1, 10, 1),
    'min_samples_leaf': [5], #range(1, 11, 1),
    'min_samples_split': [2] #list(range(1, 11, 1))
}

rf_clf = RandomForestClassifier(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=5, n_jobs=-1, scoring='f1')
grid_cv.fit(X_train, y_train)
pred = grid_cv.best_estimator_.predict(X_test)
print(grid_cv.best_score_)
print(grid_cv.best_params_)
print(classification_report(y_test, pred))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(train_service_target[train_columns].select_dtypes(['float64', 'int64']), 
                                                    train_service_target['Repurchase'], 
                                                    test_size=0.2, 
                                                    stratify=train_service_target['Repurchase'])

params = {
    'n_estimators': [300], #list(range(100, 501, 100)),
#     'max_features' : [int(np.sqrt(len(train_columns)))],
    'max_depth' : [6], #range(1, 10, 1),
    'min_samples_leaf': [5], #range(1, 11, 1),
    'min_samples_split': [2] #list(range(1, 11, 1))
}

rf_clf = RandomForestClassifier(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=5, n_jobs=-1, scoring='f1')
grid_cv.fit(X_train, y_train)
pred = grid_cv.best_estimator_.predict(X_test)
print(grid_cv.best_score_)
print(grid_cv.best_params_)
print(classification_report(y_test, pred))

In [None]:
pred_submission = grid_cv.predict(predict_service[train_columns].select_dtypes(['float64', 'int64']))
predict_service['pred'] = pred_submission
predict_service.head()

## SVM

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

X_train, X_test, y_train, y_test = train_test_split(train_service[train_columns].select_dtypes(['float64', 'int64']), 
                                                    train_service['Repurchase'], 
                                                    test_size=0.2, 
                                                    stratify=train_service['Repurchase'])

svm = SVC(kernel='rbf', gamma=0.10, C=10.0)
svm.fit(X_train, y_train)
pred = svm.predict(X_test)
print(classification_report(y_test, pred))

## XGBClassifier

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

X_train, X_test, y_train, y_test = train_test_split(train_service[train_columns].select_dtypes(['float64', 'int64']), 
                                                    train_service['Repurchase'], 
                                                    test_size=0.2, 
                                                    stratify=train_service['Repurchase'])

# xgb_wrapper = XGBRFClassifier(n_estimators=400, learning_rate=0.1, max_depth=15)
# xgb_wrapper.fit(X_train, y_train)
# pred = xgb_wrapper.predict(X_test)

xgb_wrapper = XGBClassifier(objective = 'binary:logistic')
params = {
     'n_estimators': [300],
    'learning_rate' : [0.1],
    'max_depth': [20],
    'min_samples_split': [1]
}

grid_cv = GridSearchCV(xgb_wrapper, param_grid=params, cv=5, n_jobs=-1, scoring='f1')
grid_cv.fit(X_train, y_train)
pred = grid_cv.best_estimator_.predict(X_test)
print(grid_cv.best_score_)
print(grid_cv.best_params_)
print(classification_report(y_test, pred))

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

X_train, X_test, y_train, y_test = train_test_split(train_service_target[train_columns].select_dtypes(['float64', 'int64']), 
                                                    train_service_target['Repurchase'], 
                                                    test_size=0.2, 
                                                    stratify=train_service_target['Repurchase'])

xgb_wrapper = XGBClassifier(objective = 'binary:logistic')
params = {
     'n_estimators': [300],
    'learning_rate' : [0.1],
    'max_depth': [20],
    'min_samples_split': [1]
}

grid_cv = GridSearchCV(xgb_wrapper, param_grid=params, cv=5, n_jobs=-1, scoring='f1')
grid_cv.fit(X_train, y_train)
pred = grid_cv.best_estimator_.predict(X_test)
print(grid_cv.best_score_)
print(grid_cv.best_params_)
print(classification_report(y_test, pred))

In [15]:
col_list[:3]

['viewtime_mean', 'ChurnRateByProductcode', 'viewtime_std']

In [16]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

import numpy as np
from itertools import combinations

list(combinations(list(np.abs(train_service.corrwith(train_service['Repurchase'])).sort_values(ascending=False).index), 2))

col_list = list(np.abs(train_service.corrwith(train_service['Repurchase'])).sort_values(ascending=False).index)
col_list.remove('Repurchase')

scoreList = []
for i in range(1, len(col_list)+1):  
#     train_columns_comb = list(combinations(col_list, i))
#     for c in train_columns_comb:
    train_columns = col_list[:i]
    X_train, X_test, y_train, y_test = train_test_split(train_service[train_columns].select_dtypes(['float64', 'int64']), 
                                                        train_service['Repurchase'], 
                                                        test_size=0.2, 
                                                        stratify=train_service['Repurchase'])

    xgb_wrapper = XGBClassifier(objective = 'binary:logistic')
    params = {
        'n_estimators': [300],
        'learning_rate' : [0.1],
        'max_depth': [20],
        'min_samples_split': [1]
    }

    grid_cv = GridSearchCV(xgb_wrapper, param_grid=params, cv=5, n_jobs=-1, scoring='f1')
    grid_cv.fit(X_train, y_train)
    pred = grid_cv.best_estimator_.predict(X_test)
    # print(grid_cv.best_score_)
    # print(grid_cv.best_params_)
    # print(train_columns, ':', classification_report(y_test, pred))
    print(train_columns, classification_report(y_test, pred).split('\n')[2].split(' ')[30], classification_report(y_test, pred).split('\n')[3].split(' ')[30])
    scoreList.append([train_columns, classification_report(y_test, pred).split('\n')[2].split(' ')[30], classification_report(y_test, pred).split('\n')[3].split(' ')[30]])

pd.DataFrame(scoreList, columns=['columns', 'f1-0', 'f1-1'])

Parameters: { min_samples_split } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


['viewtime_mean'] 0.78 0.16
Parameters: { min_samples_split } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


['viewtime_mean', 'ChurnRateByProductcode'] 0.78 0.25
Parameters: { min_samples_split } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


['viewtime_mean', 'ChurnRateByProductco

['viewtime_mean', 'ChurnRateByProductcode', 'viewtime_std', 'agegroup', 'promo_100', 'pgamount', 'ChurnRateByChargetypeid', 'viewtime_sum', 'chargetypeid', 'tot_viewtime_sum'] 0.79 0.38
Parameters: { min_samples_split } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


['viewtime_mean', 'ChurnRateByProductcode', 'viewtime_std', 'agegroup', 'promo_100', 'pgamount', 'ChurnRateByChargetypeid', 'viewtime_sum', 'chargetypeid', 'tot_viewtime_sum', 'view_count'] 0.79 0.36
Parameters: { min_samples_split } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


['viewtime_mean', 'ChurnRate

['viewtime_mean', 'ChurnRateByProductcode', 'viewtime_std', 'agegroup', 'promo_100', 'pgamount', 'ChurnRateByChargetypeid', 'viewtime_sum', 'chargetypeid', 'tot_viewtime_sum', 'view_count', 'tot_view_count', 'concurrentwatchcount', 'coinReceived', 'devicetype_cnt', 'isauth', 'tot_viewtime_std', 'tot_viewtime_mean'] 0.80 0.39


Unnamed: 0,columns,f1-0,f1-1
0,[viewtime_mean],0.78,0.16
1,"[viewtime_mean, ChurnRateByProductcode]",0.78,0.25
2,"[viewtime_mean, ChurnRateByProductcode, viewti...",0.78,0.27
3,"[viewtime_mean, ChurnRateByProductcode, viewti...",0.78,0.32
4,"[viewtime_mean, ChurnRateByProductcode, viewti...",0.78,0.34
5,"[viewtime_mean, ChurnRateByProductcode, viewti...",0.78,0.34
6,"[viewtime_mean, ChurnRateByProductcode, viewti...",0.78,0.36
7,"[viewtime_mean, ChurnRateByProductcode, viewti...",0.78,0.36
8,"[viewtime_mean, ChurnRateByProductcode, viewti...",0.78,0.35
9,"[viewtime_mean, ChurnRateByProductcode, viewti...",0.79,0.38


In [19]:
resultDf

Unnamed: 0,columns,f1-0,f1-1
0,[viewtime_mean],0.78,0.16
1,"[viewtime_mean, ChurnRateByProductcode]",0.78,0.25
2,"[viewtime_mean, ChurnRateByProductcode, viewti...",0.78,0.27
3,"[viewtime_mean, ChurnRateByProductcode, viewti...",0.78,0.32
4,"[viewtime_mean, ChurnRateByProductcode, viewti...",0.78,0.34
5,"[viewtime_mean, ChurnRateByProductcode, viewti...",0.78,0.34
6,"[viewtime_mean, ChurnRateByProductcode, viewti...",0.78,0.36
7,"[viewtime_mean, ChurnRateByProductcode, viewti...",0.78,0.36
8,"[viewtime_mean, ChurnRateByProductcode, viewti...",0.78,0.35
9,"[viewtime_mean, ChurnRateByProductcode, viewti...",0.79,0.38


In [18]:
resultDf = pd.DataFrame(scoreList, columns=['columns', 'f1-0', 'f1-1'])
resultDf.to_csv('result.csv')

In [None]:
pred_submission = grid_cv.best_estimator_.predict(predict_service_target[train_columns].select_dtypes(['float64', 'int64']))
predict_service_target['pred'] = pred_submission
predict_service_target.head()

## XGBRFClassifier

### 1

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRFClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

X_train, X_test, y_train, y_test = train_test_split(train_service[train_columns].select_dtypes(['float64', 'int64']), 
                                                    train_service['Repurchase'], 
                                                    test_size=0.2, 
                                                    stratify=train_service['Repurchase'])

# xgb_wrapper = XGBRFClassifier(n_estimators=400, learning_rate=0.1, max_depth=15)
# xgb_wrapper.fit(X_train, y_train)
# pred = xgb_wrapper.predict(X_test)

xgb_wrapper = XGBRFClassifier(objective = 'binary:logistic')
params = {
     'n_estimators': [300],
    'learning_rate' : [0.1],
    'max_depth': [20],
    'min_samples_split': [1, 2, 3, 4, 5]
}

grid_cv = GridSearchCV(xgb_wrapper, param_grid=params, cv=5, n_jobs=-1, scoring='f1')
grid_cv.fit(X_train, y_train)
pred = grid_cv.best_estimator_.predict(X_test)
print(grid_cv.best_score_)
print(grid_cv.best_params_)
print(classification_report(y_test, pred))

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRFClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

X_train, X_test, y_train, y_test = train_test_split(train_service_target[train_columns].select_dtypes(['float64', 'int64']), 
                                                    train_service_target['Repurchase'], 
                                                    test_size=0.2, 
                                                    stratify=train_service_target['Repurchase'])

# xgb_wrapper = XGBRFClassifier(n_estimators=400, learning_rate=0.1, max_depth=15)
# xgb_wrapper.fit(X_train, y_train)
# pred = xgb_wrapper.predict(X_test)

xgb_wrapper = XGBRFClassifier(objective = 'binary:logistic')
params = {
     'n_estimators': [300],
    'learning_rate' : [0.1],
    'max_depth': [20],
    'min_samples_split': [1]
}

grid_cv = GridSearchCV(xgb_wrapper, param_grid=params, cv=5, n_jobs=-1, scoring='f1')
grid_cv.fit(X_train, y_train)
pred = grid_cv.best_estimator_.predict(X_test)
print(grid_cv.best_score_)
print(grid_cv.best_params_)
print(classification_report(y_test, pred))

In [None]:
predict_service.shape

In [None]:
pred_submission = grid_cv.best_estimator_.predict(predict_service[train_columns].select_dtypes(['float64', 'int64']))
predict_service['pred'] = pred_submission
predict_service.head()

# Submission

In [None]:
# 결과 제출 답안지 불러오기
# predict_service['Churn'] = predict_service['pred'].apply(lambda x: 0 if x==1 else 1)
predict_service['Churn'] = predict_service['pred']

ds_sheet = "/content/drive/MyDrive/SK_AI/CDS_submission.csv"
df_sheet = pd.read_csv(ds_sheet)
df_sheet.drop('CHURN', axis=1, inplace=True)
df_sheet.info()

# 답안지에 답안 표기
df_result = predict_service.loc[:,('uno','registerdate','productcode')]
df_result['KEY']   = df_result['uno'] + '|' + df_result['registerdate'].dt.strftime('%y-%m-%d %I:%M:%S') + '|' + df_result['productcode']   # 판다스 strftime()
df_result['CHURN'] = predict_service['Churn']
df_result = df_result.loc[:,('KEY','CHURN')]
df_answer_sheet = pd.merge(df_sheet, df_result, on='KEY', how='left')


# 답안지 제출 파일 생성하기
ds_answer_sheet = "CDS_submission_XGBRFClassifier.csv"
df_answer_sheet.to_csv(ds_answer_sheet, index=False, encoding='utf8')

In [None]:
# 결과 제출 답안지 불러오기
# predict_service['Churn'] = predict_service['pred'].apply(lambda x: 0 if x==1 else 1)
predict_service_target['Churn'] = predict_service_target['pred']

ds_sheet = "/content/drive/MyDrive/SK_AI/CDS_submission.csv"
df_sheet = pd.read_csv(ds_sheet)
df_sheet.drop('CHURN', axis=1, inplace=True)
df_sheet.info()

# 답안지에 답안 표기
df_result = predict_service.loc[:,('uno','registerdate','productcode')]
df_result['KEY']   = df_result['uno'] + '|' + df_result['registerdate'].dt.strftime('%y-%m-%d %I:%M:%S') + '|' + df_result['productcode']   # 판다스 strftime()
df_result['CHURN'] = predict_service_target['Churn']
df_result = df_result.loc[:,('KEY','CHURN')]
df_answer_sheet = pd.merge(df_sheet, df_result, on='KEY', how='left')
df_answer_sheet['CHURN'] = df_answer_sheet['CHURN'].apply(int)
# 답안지 제출 파일 생성하기
ds_answer_sheet = "CDS_submission_XGBClassifier_target.csv"
df_answer_sheet.to_csv(ds_answer_sheet, index=False, encoding='utf8')

In [None]:
df_answer_sheet.head()

In [None]:
predict_service[predict_service['uno'] == '7de6c80c6cb5c5098bbfef8d9da75dfdd338b681d96691b420cbddbf35b2cb878a07272fd1a3d9eb2506a19f27738f05dfc8627da765152ccc1a44e565b4a86e|21-03-14 10:55:38|pk_2025']

In [None]:
predict_service[predict_service['uno'] == '7de6c80c6cb5c5098bbfef8d9da75dfdd338b681d96691b420cbddbf35b2cb878a07272fd1a3d9eb2506a19f27738f05dfc8627da765152ccc1a44e565b4a86e']

In [None]:
predict_service.head(10)

In [None]:
predict_service.shape