In [1]:
import os
import pandas as pd
import numpy as np
import math
from matplotlib.pyplot import *
import matplotlib.pyplot as plt
from matplotlib import animation
from matplotlib import cm
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from dateutil import parser
import io
import base64
from IPython.display import HTML
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Number of rows requested from each training CSV (the first chunk of each file).
chunk_size = 100000

# Numeric features: open a chunked reader, then pull its first chunk.
chunk_num_train = pd.read_csv("../../data/train_numeric.csv", chunksize=chunk_size)
df_num_train = chunk_num_train.get_chunk(chunk_size)

# Date features: same pattern.
chunk_dat_train = pd.read_csv("../../data/train_date.csv", chunksize=chunk_size)
df_dat_train = chunk_dat_train.get_chunk(chunk_size)

# Categorical features: same pattern.
chunk_cat_train = pd.read_csv("../../data/train_categorical.csv", chunksize=chunk_size)
df_cat_train = chunk_cat_train.get_chunk(chunk_size)

  return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)


 
OKデータをアンダーサンプリングする

In [3]:
# Drop columns that are entirely NaN, then replace every remaining NaN with a
# large sentinel value (999). The frame shape is printed before and after.
print(f"before{df_num_train.shape}")
df_num_train.dropna(axis='columns', how='all', inplace=True)
df_num_train.fillna(value=999, inplace=True)
print(f"after{df_num_train.shape}")

before(100000, 970)
after(100000, 970)


In [4]:
# Hold out part of the numeric frame for validation. A fixed seed makes the
# split repeatable across kernel restarts, and stratifying on 'Response' keeps
# the class ratio the same in both parts.
df_num, df_validation = train_test_split(
    df_num_train,
    random_state=42,
    stratify=df_num_train['Response'],
)

カテゴリカルデータも対応するIDだけ残す

In [5]:
# カテゴリ変数のパース(仮実装で、値に変換するだけ)
# 最終的にはビットマップっぽいので、エンコーディングする。
from parse import *
def cat_data_to_val(x):
    """Convert a categorical token of the form ``"T<number>"`` to a float.

    Anything that is not exactly a ``str`` (for example NaN) is returned
    unchanged.
    """
    # Guard clause: pass non-string cells straight through.
    if type(x) is not str:
        return x
    parsed = parse("T{}", x)
    return float(parsed[0])


In [6]:

def is_bit_on(s, pos):
    """Return 1 if bit ``pos`` is set in the integer encoded by ``s``, else 0.

    ``s`` is expected to be a token of the form ``"T<integer>"``. Anything
    that is not exactly a ``str`` (for example NaN) yields 0.
    """
    # Guard clause: non-string cells carry no bits.
    if type(s) is not str:
        return 0
    parsed = parse("T{}", s)
    return 1 if int(parsed[0]) & (1 << pos) else 0

In [None]:
def decode_categorical_data(df):
    """Expand every categorical column of ``df`` into one 0/1 column per bit.

    String cells are expected to be tokens of the form ``"T<integer>"``. The
    integer is read as a bitmap over bit positions 0-31 (4294967296 is added
    to negative integers first). For each column other than ``'Id'``:

    * a new ``int8`` column named ``"<column>_bit_<n>"`` is appended for every
      bit position ``n`` that is set in at least one distinct value of the
      column, in first-seen order;
    * non-string cells (e.g. NaN) become 0 in every bit column;
    * the original column is dropped.

    The frame is modified in place and nothing is returned.

    Raises:
        ValueError: if a string cell does not match the ``"T{}"`` pattern.
    """
    # Snapshot the column names up front: the loop below adds and drops
    # columns, and iterating over the frame itself while reshaping it is
    # unsafe. This also avoids DataFrame.iteritems, removed in pandas 2.0.
    source_columns = [name for name in df.columns if name != 'Id']

    for column_name in source_columns:
        column = df[column_name]

        # Parse each distinct token once instead of once per row per bit.
        codes = {}
        for value in column.unique():
            if not isinstance(value, str):
                continue
            r = parse("T{}", value)
            if r is None:
                raise ValueError(
                    "column %r: value %r does not match the 'T{}' pattern"
                    % (column_name, value)
                )
            code = int(r[0])
            if code < 0:
                # Same adjustment as before: shift negatives into the
                # unsigned range; the low 32 bits are unaffected.
                code = code + 4294967296
            codes[value] = code

        # Collect the bit positions used by this column, in first-seen order
        # so the generated columns keep their previous ordering.
        bitlist = []
        for code in codes.values():
            for bitpos in range(0, 32):
                if code & (1 << bitpos) and bitpos not in bitlist:
                    bitlist.append(bitpos)

        for bit in bitlist:
            # Token -> 0/1 lookup for this bit; unmapped cells (NaN and other
            # non-strings) come back as NaN from map() and are filled with 0.
            lookup = {value: (code >> bit) & 1 for value, code in codes.items()}
            df[column_name + "_bit_" + str(bit)] = (
                column.map(lookup).fillna(0).astype('int8')
            )
        df.drop(column_name, axis=1, inplace=True)

    

In [None]:
# Replace each categorical column of df_cat_train with per-bit 0/1 columns.
# The function modifies the frame in place and returns nothing.
decode_categorical_data(df_cat_train)

In [None]:
# Split the training part by label: Response == 0 (OK) and Response == 1 (NG).
df_num_ok = df_num[df_num['Response'] == 0]
df_num_ng = df_num[df_num['Response'] == 1]

# Keep about `ok_per_ng` OK rows for every NG row. The fraction is capped at
# 1.0 because sample(frac > 1) raises without replacement, and an empty OK
# table is handled explicitly to avoid a division by zero.
ok_per_ng = 20
if len(df_num_ok) > 0:
    undersample_rate = min(1.0, len(df_num_ng) * ok_per_ng / len(df_num_ok))
else:
    undersample_rate = 0.0

# A fixed seed makes the undersampled set repeatable across kernel restarts.
df_num_ok_sample = df_num_ok.sample(frac=undersample_rate, random_state=42)

df_num_balance = pd.concat([df_num_ok_sample, df_num_ng])
print('Response = 0のテーブル', df_num_ok.shape)
print('Response = 1のテーブル', df_num_ng.shape)
print('Response = 0をアンダーサンプルした後のフレーム', df_num_balance.shape)

In [None]:
# Show the column labels of the categorical frame.
df_cat_train.columns

In [None]:
# Attach the categorical columns to each numeric frame by 'Id'. A left join
# keeps every row of the numeric side.
df_train = df_num_balance.merge(df_cat_train, how='left', on=['Id'])
df_test = df_validation.merge(df_cat_train, how='left', on=['Id'])


In [None]:
# Print every training row that contains at least one missing value.
has_missing = df_train.isnull().any(axis=1)
print(df_train[has_missing])

In [None]:
import xgboost as xgb


def separate_X_y(df):
    """Split a labelled frame into features and target.

    Returns a ``(X, y)`` tuple where ``X`` is ``df`` without the
    ``'Response'`` column and ``y`` is the ``'Response'`` column itself.
    """
    features = df.drop(columns=['Response'])
    target = df['Response']
    return features, target

 

In [None]:


def train_with_r_forest(df, class_weight=None, random_state=None):
    """Fit a random forest classifier on a labelled frame.

    Args:
        df: DataFrame holding the feature columns plus a ``'Response'``
            column, which is used as the target.
        class_weight: optional value forwarded to ``RandomForestClassifier``
            (for example ``{0: 0.3, 1: 0.7}``). The default ``None`` keeps
            the previous behaviour, in which no class weights were applied.
        random_state: optional seed forwarded to ``RandomForestClassifier``.
            The default ``None`` keeps the previous unseeded behaviour.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    X_train, y_train = separate_X_y(df)

    # The forest used to be built next to an unused local weight dict that
    # never reached the classifier; weights are now an explicit argument.
    rf = RandomForestClassifier(
        max_depth=100,
        n_estimators=200,
        class_weight=class_weight,
        random_state=random_state,
    )
    rf.fit(X_train, y_train)
    return rf





In [None]:
# Features and labels of the held-out frame.
X_test, y_test = separate_X_y(df_test)

# Fit the forest on the training frame, then predict the held-out labels.
rf = train_with_r_forest(df_train)
y_pred = rf.predict(X_test)
y_pred

In [None]:

from sklearn.metrics import confusion_matrix
# Confusion matrix of true vs. predicted labels. Pinning labels to [0, 1]
# guarantees a 2x2 result even when one class is missing from both y_test and
# y_pred, which the four-way unpacking in calc_mcc relies on.
cf_mat = confusion_matrix(y_test, y_pred, labels=[0, 1])
cf_mat

In [None]:
def calc_mcc(cf_mat):
    """Compute the Matthews correlation coefficient of a 2x2 confusion matrix.

    Args:
        cf_mat: 2x2 array-like whose flattened order is ``tn, fp, fn, tp``.

    Returns:
        The coefficient as a float. When any marginal total is zero the
        denominator vanishes and 0.0 is returned instead of NaN or inf.

    The four counts are also printed, as before.
    """
    # Work with Python ints: the product of four NumPy int64 totals can wrap
    # around silently once the counts get large.
    tn, fp, fn, tp = (int(v) for v in np.asarray(cf_mat).ravel())
    print(tn, fp, fn, tp)

    denom_sq = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if denom_sq == 0:
        return 0.0

    mcc = (tp * tn - fp * fn) / np.sqrt(float(denom_sq))
    return mcc

# Compute the Matthews correlation coefficient from the confusion matrix and
# print it (calc_mcc also prints the four counts).
mcc = calc_mcc(cf_mat)
print(mcc)
    

In [None]:
def plot_feature_importance(clf, df):
    """Print, plot and return the feature importances of a fitted model.

    Args:
        clf: fitted estimator exposing ``feature_importances_``.
        df: DataFrame whose column labels name the features. It may be the
            feature matrix itself, or a frame that still contains the
            ``'Response'`` target column; in that case ``'Response'`` is
            left out so the labels line up with the importances.

    Returns:
        DataFrame with columns ``number`` (feature position), ``feature``
        (importance) and ``name`` (column label), sorted by descending
        importance and indexed by feature position.

    Raises:
        ValueError: if the number of labels does not match the number of
            importances.
    """
    importances = np.asarray(clf.feature_importances_)

    # Feature names. When the frame has one label too many because it still
    # carries the target, drop the target so later names are not shifted.
    labels = df.columns
    if len(labels) != len(importances) and 'Response' in labels:
        labels = labels.drop('Response')
    if len(labels) != len(importances):
        raise ValueError(
            "got %d column labels for %d feature importances"
            % (len(labels), len(importances))
        )

    # Feature positions in descending order of importance.
    indices = np.argsort(importances)[::-1]
    ranked_names = [str(labels[i]) for i in indices]
    ranked_values = importances[indices]

    # Print the ranking, most important feature first.
    for i in range(len(importances)):
        print(str(i + 1) + "   " + ranked_names[i] + "   " + str(ranked_values[i]))

    # Build the table directly from the ranked arrays. This replaces the
    # removed `.ix` accessor and the positional `iloc[i, 1]` write, which
    # overwrote the importance column with names on current pandas.
    f2 = pd.DataFrame(
        {'number': indices, 'feature': ranked_values, 'name': ranked_names},
        index=indices,
    )

    plt.title('Feature Importance')
    plt.bar(range(len(importances)), ranked_values, color='lightblue', align='center')
    plt.xticks(range(len(importances)), ranked_names, rotation=90)
    plt.xlim([-1, len(importances)])
    plt.tight_layout()
    plt.show()

    return f2


# Rank the forest's feature importances (this also prints and plots them) and
# save the resulting table as CSV without the index column.
# NOTE(review): df_train still contains 'Response', which separate_X_y drops
# before fitting, so it has one more column than rf has features.
f_importance = plot_feature_importance(rf, df_train)
f_importance.to_csv("../../data/bosch_0620_features.csv", index=False)

In [None]:
# Select the entries of the 'max' row of df_desc whose value is NaN.
# NOTE(review): df_desc is not defined in any visible cell; presumably it is a
# DataFrame.describe() result. Confirm and define it before this cell.
df_desc.loc['max', np.isnan(df_desc.loc['max', :])]

In [None]:



# Cross-validated Matthews correlation coefficient.
# make_scorer and cross_val_score are imported here because no other cell
# imports them; without this the cell raises NameError.
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import cross_val_score

# NOTE(review): clf, X and y were not defined in any visible cell. They are
# set here to an unfitted forest with the same settings as
# train_with_r_forest and to the training frame's features and labels.
# Confirm this is the intended data and model.
X, y = separate_X_y(df_train)
clf = RandomForestClassifier(max_depth=100, n_estimators=200)

matthew_scorer = make_scorer(matthews_corrcoef)
scores = cross_val_score(clf, X, y, cv=5, scoring=matthew_scorer)

In [None]:
# NOTE(review): df_cat is not defined in any visible cell (the categorical
# frame used elsewhere is df_cat_train). Confirm which frame is meant here.
df_cat

In [None]:
# Display the training part returned by train_test_split.
df_num