In [1]:
import time
import numpy as np
import pandas as pd
import pickle
import dask.array as da
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import svm
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.externals import joblib

%config InlineBackend.figure_format = 'svg'
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the class label and the raw "call_pid" column.
# The training CSV is parsed once (restricted to the two columns used here)
# instead of once per column.
_train_frame = pd.read_csv("origin_data.csv", usecols=["safe_type", "call_pid"])
safe_type = _train_frame["safe_type"]
train_call_pid = _train_frame["call_pid"]
test_call_pid = pd.read_csv("origin_test.csv", usecols=["call_pid"])["call_pid"]

In [8]:
# TF-IDF over 1- to 5-grams; min_df=3 / max_df=0.9 prune terms that occur in
# fewer than 3 documents or in more than 90% of them. The vectorizer is fitted
# on the training documents only and then applied unchanged to the test set.
vectorizes = TfidfVectorizer(
    min_df=3,
    max_df=0.9,
    ngram_range=(1, 5),
)
train_call_pid_tfidf = vectorizes.fit_transform(train_call_pid.tolist())
test_call_pid_tfidf = vectorizes.transform(test_call_pid.tolist())

In [12]:
# Cache both TF-IDF matrices on disk (train first, then test) so they can be
# reloaded without re-fitting the vectorizer.
for _pkl_path, _matrix in (
    ("train_call_pid_tfidf.pkl", train_call_pid_tfidf),
    ("test_call_pid_tfidf.pkl", test_call_pid_tfidf),
):
    with open(_pkl_path, "wb") as fp:
        pickle.dump(_matrix, fp)

In [3]:
# Reload the cached TF-IDF matrices from disk.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by this notebook.
_TFIDF_CACHE = {
    "train": "train_call_pid_tfidf.pkl",
    "test": "test_call_pid_tfidf.pkl",
}
train_call_pid_tfidf = pd.read_pickle(_TFIDF_CACHE["train"])
test_call_pid_tfidf = pd.read_pickle(_TFIDF_CACHE["test"])

In [4]:
# Base learners whose out-of-fold predictions are stacked further down.
# Every estimator that draws random numbers gets a fixed seed so that repeated
# runs of the notebook produce the same fitted models.
MODEL_SEED = 0

bc_model = BaggingClassifier(random_state=MODEL_SEED)
gbc_model = GradientBoostingClassifier(random_state=MODEL_SEED)
lr_model = LogisticRegression(random_state=MODEL_SEED)
svm_model = svm.LinearSVC(random_state=MODEL_SEED)
dt_model = DecisionTreeClassifier(random_state=MODEL_SEED)
xgb_model = XGBClassifier(max_depth=7,
                          learning_rate=0.05,
                          n_estimators=1000)

# n_estimators is passed by keyword: newer scikit-learn releases make
# estimator constructor arguments keyword-only.
rfc_model = RandomForestClassifier(n_estimators=200, random_state=MODEL_SEED)
etc_model = ExtraTreesClassifier(random_state=MODEL_SEED)
mnb_model = naive_bayes.MultinomialNB(alpha=0.01)
ada_model = AdaBoostClassifier(random_state=MODEL_SEED)

In [5]:
def get_oof(model, x_train, y_train, x_test, n_splits):
    """
    Compute out-of-fold (OOF) predictions of ``model`` for stacking.

    The training rows are split into ``n_splits`` stratified folds. For each
    fold the model is re-fitted on the remaining folds and used to predict
    (a) the held-out training rows and (b) the whole test set.

    :param model: estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
        re-fitted in place once per fold.
    :param x_train: training feature matrix; must expose ``.shape`` and
        support row selection with an integer index array (e.g. ``np.ndarray``
        or a scipy sparse matrix).
    :param y_train: class labels, one per training row (array-like). Labels
        must be numeric because predictions are stored in float arrays.
    :param x_test: test feature matrix with the same columns as ``x_train``.
    :param n_splits: number of folds.
    :type n_splits: int
    :return: ``(oof_train, oof_test)`` with shapes ``(n_train, 1)`` and
        ``(n_test, 1)``. ``oof_train`` holds, for every training row, the
        prediction of the model that did not see that row. ``oof_test`` is the
        mean over folds of the predicted test labels, so it may contain
        fractional values.
    """
    # Positional indexing below must not depend on a pandas index.
    y_train = np.asarray(y_train)
    n_train, n_test = x_train.shape[0], x_test.shape[0]
    # No random_state: without shuffle=True it has no effect, and recent
    # scikit-learn versions raise ValueError for that combination.
    kf = StratifiedKFold(n_splits=n_splits)
    oof_train = np.empty((n_train, ))
    oof_test_skf = np.empty((n_splits, n_test))
    for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
        model.fit(x_train[train_index], y_train[train_index])
        oof_train[test_index] = model.predict(x_train[test_index])
        oof_test_skf[i, :] = model.predict(x_test)
    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
train_tfidf_features = train_call_pid_tfidf
test_tfidf_features = test_call_pid_tfidf

# Number of folds used for every base model.
N_FOLDS = 10

# Convert the feature matrices and labels once instead of once per model.
_x_train_lil = train_tfidf_features.tolil()
_x_test_lil = test_tfidf_features.tolil()
_y_train = safe_type.values

# (short name, estimator) pairs in the order the models are trained.
_base_models = [
    ("lr", lr_model),
    ("gbc", gbc_model),
    ("bc", bc_model),
    ("svm", svm_model),
    ("dt", dt_model),
    ("rfc", rfc_model),
    ("etc", etc_model),
    ("mnb", mnb_model),
    ("ada", ada_model),
    ("xgb", xgb_model),
]
# Column order of the stacked matrices. It intentionally differs from the
# training order (xgb is the fifth column) and must stay stable for consumers
# of the saved stacking files.
_stack_order = ["lr", "gbc", "bc", "svm", "xgb", "dt", "rfc", "etc", "mnb", "ada"]

oof_train_by_model = {}
oof_test_by_model = {}
for _name, _model in _base_models:
    try:
        _oof_train, _oof_test = get_oof(_model, _x_train_lil, _y_train,
                                        _x_test_lil, N_FOLDS)
        oof_train_by_model[_name] = _oof_train
        oof_test_by_model[_name] = _oof_test
        # Keep the historical per-model variable names (e.g. lr_model_oof_train)
        # available to later cells.
        globals()["{}_model_oof_train".format(_name)] = _oof_train
        globals()["{}_model_oof_test".format(_name)] = _oof_test
        # NOTE: despite the ".csv" suffix these files are pickles; the names
        # are kept unchanged so existing readers keep working.
        with open("call_pid_{}_model_oof_train.csv".format(_name), "wb") as fp:
            pickle.dump(_oof_train, fp)
        with open("call_pid_{}_model_oof_test.csv".format(_name), "wb") as fp:
            pickle.dump(_oof_test, fp)
        print("{} success!".format(_name))
    except Exception as exc:
        # Best effort: report the failure (with its cause) and continue with the
        # next model. KeyboardInterrupt is deliberately not caught so a long
        # run can still be aborted.
        print("{} error!".format(_name), repr(exc))

# Stacking needs every base model; fail with a clear message instead of a
# NameError on a missing variable.
_missing = [_name for _name in _stack_order if _name not in oof_train_by_model]
if _missing:
    raise RuntimeError(
        "cannot build stacking features, no OOF predictions for: "
        + ", ".join(_missing)
    )

call_pid_stacking_train_10 = np.hstack(
    [oof_train_by_model[_name] for _name in _stack_order])
call_pid_stacking_test_10 = np.hstack(
    [oof_test_by_model[_name] for _name in _stack_order])

with open("call_pid_stacking_train_10.pkl", "wb") as fp:
    pickle.dump(call_pid_stacking_train_10, fp)

with open("call_pid_stacking_test_10.pkl", "wb") as fp:
    pickle.dump(call_pid_stacking_test_10, fp)