## GCI solution

train.csvは重いのでupしてないです。

In [1]:
import numpy as np
import pandas as pd
import itertools
from tqdm import tqdm

from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import roc_auc_score
from lightgbm.sklearn import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict, KFold

### import

In [2]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# UnderSamplingに用いるモデルの数
us_num  =15

## Target Encoding用のクラスの準備

In [4]:
class TargetEncoding:
    def __init__(self, col_names, target):
        self.col_names = col_names
        self.target = target

    """trainデータ用"""
    def mean_train_encoding(self, df):
        y_tr = df[self.target].values
        skf = StratifiedKFold(5, shuffle = True, random_state=0)
        # TE後の名前を用意する.
        for col in self.col_names:
            df[col + '_mean_encoded'] = np.nan

        # trn : training, val : validation
        for trn_ind , val_ind in skf.split(df, y_tr):
            x_tr, x_val = df.iloc[trn_ind], df.iloc[val_ind]
            for col in self.col_names:
                tr_mean = x_tr.groupby(col)[self.target].mean()
                mean = x_val[col].map(tr_mean)
                df[col + '_mean_encoded'].iloc[val_ind] = mean

        prior = df[self.target].mean()
        for col in self.col_names:
            df[col + '_mean_encoded'].fillna(prior, inplace = True)

        return df

    """"testデータ用"""
    def mean_test_encoding(self, df_trn, df_tst):
        # TE後の名前を用意する.
        for col in self.col_names:
            df_tst[col + '_mean_encoded'] = np.nan

        # encoding用の前処理
        for col in self.col_names:
            tr_mean = df_trn.groupby(col)[self.target].mean()
            mean = df_tst[col].map(tr_mean)
            df_tst[col + '_mean_encoded'] = mean

        prior = df_trn[self.target].mean()
        # testにはあるがtrainにはないカテゴリに対して平均値を入れる.
        for col in self.col_names:
            df_tst[col + '_mean_encoded'].fillna(prior, inplace = True)

        return df_tst

## 使用するカテゴリカル変数の用意

In [5]:
# カテゴリカル変数の確認
print(train.select_dtypes(["object"]).columns)

Index(['state', 'stop_date', 'stop_time', 'location_raw', 'county_name',
       'fine_grained_location', 'police_department', 'driver_gender',
       'driver_race_raw', 'driver_race', 'violation_raw', 'violation',
       'search_type_raw', 'search_type', 'officer_id', 'stop_duration'],
      dtype='object')


In [6]:
# カテゴリカル変数で同じカテゴリに対してrawとrawじゃないもの両方があればrawを残したカラムのリスト
# stateは中身がすべて同じなので取り除いた。
columns_name_raw = ['stop_date', 
                    'stop_time', 
                    'location_raw', 
                    'county_name',
                    'fine_grained_location', 
                    'police_department', 
                    'driver_gender',
                    'driver_race_raw', 
                    'violation_raw',
                    'search_type_raw', 
                    'officer_id', 
                    'stop_duration']

columns_name_raw_list = list(itertools.combinations(columns_name_raw, 2))

## Count Encoding(CE)

In [7]:
# trainとtestの結合
trts_data = pd.concat([train, test], sort=False)

In [8]:
# 文字列としてカラム同士を結合したものを新たな記述子とする。
str_collocation_trts_dct = {"for_count":list(np.ones(len(trts_data)))}
col_drop_list = []

for col_comb in tqdm(columns_name_raw_list):
    str_collocation_trts_dct[str(col_comb[0]) + "_" + str(col_comb[1])] = trts_data[col_comb[0]].astype("str") + trts_data[col_comb[1]].astype("str")
    col_drop_list.append(str(col_comb[0]) + "_" + str(col_comb[1]))
    
str_collocation_trts_df = pd.DataFrame(str_collocation_trts_dct)

100%|██████████████████████████████████████████████████████████████████████████████████| 66/66 [00:08<00:00,  7.83it/s]


In [9]:
# Count Encoding(CE)
for_groupby_names = str_collocation_trts_df.columns.drop(["for_count"])
CE_dct = {}

for i in tqdm(for_groupby_names):
    CE_dct[i + "_count"] = str_collocation_trts_df[i].map(str_collocation_trts_df.groupby(i).count()["for_count"])
    
trts_count_df = pd.DataFrame(CE_dct)

100%|██████████████████████████████████████████████████████████████████████████████████| 66/66 [02:38<00:00,  2.39s/it]


In [10]:
# trainとtestの分割
train_count_df = trts_count_df.iloc[:len(train), :]
test_count_df = trts_count_df.iloc[len(train):, :]

## Target Encoding(TE)

In [11]:
# TEのための処理
TE_train_df = str_collocation_trts_df.iloc[:len(train), :]
TE_test_df = str_collocation_trts_df.iloc[len(train):, :]

TE_train_df["is_arrested"] = train.loc[:, ["is_arrested"]]
TE_test_df["is_arrested"] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [12]:
# Target Encoding
TE_train_arrest = TargetEncoding(TE_train_df.columns, "is_arrested")
TE_train = TE_train_arrest.mean_train_encoding(TE_train_df)
TE_test = TE_train_arrest.mean_test_encoding(TE_train_df, TE_test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

In [13]:
# カラムにarrestedや元のカテゴリカル変数が含まれたままなので削除する。
col_drop_list.extend(["is_arrested", "is_arrested_mean_encoded", "for_count", "for_count_mean_encoded"])
col_name_comb_list = TE_test.columns.drop(col_drop_list)

TE_train = TE_train.loc[:, col_name_comb_list]
TE_test = TE_test.loc[:, col_name_comb_list]

## TEとCEの結合

In [14]:
concat_train = pd.concat([TE_train, train_count_df], axis=1)
concat_test = pd.concat([TE_test, test_count_df], axis=1)

## 計算

In [15]:
# UnderSampling
y_train = train.loc[:, "is_arrested"]
y_arrested = y_train[y_train==1]
y_us_list =[]
X_us_list =[]

for i in tqdm(range(us_num)):
    y_Notarrested = y_train[y_train!=1]
    
    # y_arrestedと同じ数だけのデータをランダムに取ってくる。
    y_Notarrested_RandomSampled = y_Notarrested.reindex(np.random.permutation(y_Notarrested.index)).iloc[:len(y_arrested)]
    y_concatenated = pd.concat([y_arrested, y_Notarrested_RandomSampled])
    
    # リストに格納
    y_us_list.append(y_concatenated)
    X_us_list.append(concat_train.loc[y_concatenated.index])

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 12.92it/s]


In [16]:
# 予測
result_prob_dct = {}
counter = 0
for i, j in tqdm(zip(X_us_list, y_us_list)):
    counter += 1
    # モデルの用意
    clf = LGBMClassifier(n_jobs=5)
    clf.fit(i, j)
    result_prob_dct["LGBM_model_" + str(counter)] = clf.predict_proba(concat_test).T[1]

15it [00:37,  2.49s/it]


In [17]:
# 平均値をとる.
y_pred = pd.DataFrame(result_prob_dct).mean(axis=1)

## モデル精度確認

In [18]:
clf = LGBMClassifier()
kf = KFold(n_splits=5, random_state=0, shuffle=True)
pred_cv = cross_val_predict(clf, concat_train, train["is_arrested"], cv = kf, method='predict_proba')

In [19]:
roc_auc_score(train["is_arrested"], pred_cv[:, 1])

0.9082975242113124