In [420]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.metrics import *
from sklearn.impute import *
from sklearn.decomposition import *
from sklearn.feature_selection import *

import category_encoders as ce
from collections import defaultdict

## 1. Wczytanie danych

In [421]:
train_df = pd.read_csv("train.txt", sep = " ").sort_index()
test_df = pd.read_csv("testx.txt", sep = " ").sort_index()

## 2. Usunięcie pustych kolumn

In [422]:
def drop_empty_columns(df):
    columns_to_drop =[]
    for column in df.columns:
        if df[column].isna().all():
            columns_to_drop.append(column)
    if columns_to_drop:
        ret_df = df.drop(columns=columns_to_drop)
    return ret_df

In [423]:
train_df = drop_empty_columns(train_df)

## 3. Zakodowanie kolumn typu str, obj do category

In [424]:
# def object_as_category(df):
#     new_df = df.copy()
#     categorical_columns = new_df.select_dtypes(exclude=["number"]).columns
#     try:
#         new_df[categorical_columns] = new_df[categorical_columns].fillna("NAN").astype("category")
#         return new_df
#     except ValueError:
#         print("No categorical columns")

In [425]:
# train_df = object_as_category(train_df)

## 4. Usunięcie kolumn kategorycznych powtarzających się

In [426]:
def count_categories_in_columns(df):
    cat_df = df.select_dtypes(exclude=["number"])
    num_unique_values_map = defaultdict(list)
    for i in range(0, cat_df.shape[1]):
        column = cat_df.iloc[:, i]
        num_unique = column.nunique(dropna=False)
        num_unique_values_map[num_unique].append(column.name)
    return num_unique_values_map

In [427]:
num_unique_values_map = count_categories_in_columns(train_df)
num_unique_values_map

defaultdict(list,
            {2: ['Var191', 'Var211', 'Var213', 'Var215', 'Var224'],
             352: ['Var192'],
             51: ['Var193'],
             4: ['Var194', 'Var205', 'Var225'],
             23: ['Var195', 'Var219', 'Var226'],
             3: ['Var196', 'Var201', 'Var208', 'Var218'],
             214: ['Var197'],
             3876: ['Var198', 'Var220', 'Var222'],
             4384: ['Var199'],
             13331: ['Var200', 'Var214'],
             5553: ['Var202'],
             6: ['Var203', 'Var210'],
             100: ['Var204'],
             22: ['Var206'],
             14: ['Var207'],
             80: ['Var212'],
             1834: ['Var216'],
             12495: ['Var217'],
             7: ['Var221', 'Var227'],
             5: ['Var223', 'Var229'],
             30: ['Var228']})

In [428]:
def drop_repeated_cat_columns(df, num_unique_values_map):
    columns_to_drop = set()
    for key, value in num_unique_values_map.items():
        len_col_list = len(value)
        if len_col_list > 1:
            for i in range(len_col_list):
                col_name_i = value[i]
                for j in range(i+1, len_col_list):
                    col_name_j = value[j]
                    lab_encoder = ce.OrdinalEncoder()
                    transformed = lab_encoder.fit_transform(df[[col_name_i, col_name_j]])
                    if accuracy_score(transformed.iloc[:, 0], transformed.iloc[:, 1]) > 0.99:
                        columns_to_drop.add(col_name_i)
                        #print("Break" ,key,  i, j, accuracy_score(transformed.iloc[:, 0], transformed.iloc[:, 1]))
                        break
    return df.drop(columns=columns_to_drop)

In [429]:
train_df = drop_repeated_cat_columns(train_df, num_unique_values_map)

## 5. Problem niewystępujących kategorii w zbiorze treningowym

In [430]:
mod_train = train_df.iloc[:,:-1]
joined_df = pd.concat([mod_train, test_df], sort=False).sort_index()
count_categories_in_columns(joined_df)

defaultdict(list,
            {2: ['Var191', 'Var211', 'Var213', 'Var215', 'Var224'],
             362: ['Var192'],
             51: ['Var193'],
             4: ['Var194', 'Var196', 'Var205', 'Var225'],
             23: ['Var195', 'Var219', 'Var226'],
             226: ['Var197'],
             5074: ['Var199'],
             3: ['Var201', 'Var208', 'Var218'],
             5714: ['Var202'],
             6: ['Var203', 'Var210'],
             100: ['Var204'],
             22: ['Var206'],
             14: ['Var207'],
             81: ['Var212'],
             15416: ['Var214'],
             2016: ['Var216'],
             13991: ['Var217'],
             7: ['Var221', 'Var227'],
             4291: ['Var222'],
             5: ['Var223', 'Var229'],
             30: ['Var228'],
             2049: ['Var198', 'Var220'],
             4322: ['Var200']})

##  6. Funkcja licząca liczbę kategorii o konkretnej liczbie wystąpień

In [431]:
def count_categories_by_size(df):
    columns_dict = {}
    cat_columns = df.select_dtypes(exclude=["number"]).columns
    for column in cat_columns:
        columns_dict[column] = train_df[column].value_counts(dropna=False).to_frame().groupby(column).size()
    return columns_dict

In [432]:
column_dict_categories_by_size = count_categories_by_size(train_df)

## 7. Funkcja określająca frakcję jedynek i liczność w danej kategorii

In [433]:
def count_ones_fraction_and_size_in_category(df):
    columns_dict = {}
    cat_columns = df.select_dtypes(exclude=["number"]).columns
    for column in cat_columns:
        ones_fraction = df[[column, "class"]].fillna(-1).groupby(column)["class"].mean()
        cat_size = df[[column]].fillna(-1).groupby(column).size()
        columns_dict[column] = pd.DataFrame(data={"ones_fraction": ones_fraction, "cat_size": cat_size})
    return columns_dict

In [434]:
column_dict_categories_description = count_ones_fraction_and_size_in_category(train_df)

In [435]:
column_dict_categories_description["Var197"].head()

Unnamed: 0_level_0,ones_fraction,cat_size
Var197,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,0.025424,118
0LaQ,0.068627,102
0WHw,0.111111,180
0Xwj,0.076087,3680
0Y9G,0.083582,335


## 8. Zdropowanie kolumn z kategoriami o liczności więcej niż 

In [436]:
def drop_columns_above_n_categories(df, n):
    columns_to_drop = set()
    cat_columns = df.select_dtypes(exclude=["number"]).columns
    for column in cat_columns:
        if train_df[column].nunique(dropna=False) > n:
            columns_to_drop.add(column)
    return df.drop(columns=columns_to_drop)

In [437]:
train_df = drop_columns_above_n_categories(train_df, 100)

## 9. Wypełnienie pustych zmiennych numerycznych i dodanie kolumn informujących o NaN

In [438]:
def fill_na_in_numerical_add_column(df):
    new_df = df.copy()
    numerical_columns = new_df.select_dtypes(include=["number"]).columns
    for column in numerical_columns:
        if new_df[column].isna().any():
            new_df[column+"_isfilled"] = new_df[column].isna().map({True:1, False:0})
        new_df[column] = new_df[column].fillna(new_df[column].median())
    return new_df

In [439]:
train_df = fill_na_in_numerical_add_column(train_df)

In [440]:
train_df.shape

(40000, 373)

## 10. Target encoding

In [459]:
import xgboost
import lightgbm
import catboost
from sklearn.ensemble import RandomForestClassifier

In [460]:
def calculate_scores(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    return acc, recall, precision

In [461]:
# cat_columns = train_df.select_dtypes(exclude=["number"]).columns
# for column in cat_columns:
#     train_df[column] = train_df[column].astype("category").cat.codes

In [462]:
label_encoder = ce.BinaryEncoder()
#label_encoder = ce.OrdinalEncoder()
X = train_df.loc[:, train_df.columns != "class"]
y = train_df["class"]
label_encoder.fit(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=123)

In [463]:
X_train = label_encoder.transform(X_train)
X_test = label_encoder.transform(X_test)

#### XGBModel

In [464]:
xgb_model = xgboost.XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000)
eval_set = [(X_test, y_test)]
xgb_model.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="logloss", eval_set=eval_set, verbose=True)

[0]	validation_0-logloss:0.614884
Will train until validation_0-logloss hasn't improved in 100 rounds.
[1]	validation_0-logloss:0.550759
[2]	validation_0-logloss:0.497555
[3]	validation_0-logloss:0.452855
[4]	validation_0-logloss:0.414973
[5]	validation_0-logloss:0.382556
[6]	validation_0-logloss:0.354851
[7]	validation_0-logloss:0.330882
[8]	validation_0-logloss:0.310194
[9]	validation_0-logloss:0.292144
[10]	validation_0-logloss:0.276574
[11]	validation_0-logloss:0.262958
[12]	validation_0-logloss:0.251026
[13]	validation_0-logloss:0.240696
[14]	validation_0-logloss:0.231622
[15]	validation_0-logloss:0.223584
[16]	validation_0-logloss:0.216673
[17]	validation_0-logloss:0.21058
[18]	validation_0-logloss:0.205211
[19]	validation_0-logloss:0.200513
[20]	validation_0-logloss:0.196394
[21]	validation_0-logloss:0.192805
[22]	validation_0-logloss:0.189665
[23]	validation_0-logloss:0.186896
[24]	validation_0-logloss:0.18449
[25]	validation_0-logloss:0.18238
[26]	validation_0-logloss:0.180546

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [465]:
y_pred_xgb = xgb_model.predict(X_test)

In [466]:
calculate_scores(y_test, y_pred_xgb)

(0.9518181818181818, 0.4501541623843782, 0.8126159554730983)

#### LightGBM model

In [467]:
lightgbm_model = lightgbm.LGBMClassifier(
 learning_rate = 0.1,
 n_estimators = 1000,
 scale_pos_weight = 2)
eval_set = [(X_test, y_test)]
lightgbm_model.fit(X_train, y_train, early_stopping_rounds=100, eval_set=eval_set, eval_metric="logloss", verbose=True)

[1]	valid_0's binary_logloss: 0.225645	valid_0's binary_logloss: 0.225645
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's binary_logloss: 0.213702	valid_0's binary_logloss: 0.213702
[3]	valid_0's binary_logloss: 0.206143	valid_0's binary_logloss: 0.206143
[4]	valid_0's binary_logloss: 0.200383	valid_0's binary_logloss: 0.200383
[5]	valid_0's binary_logloss: 0.19612	valid_0's binary_logloss: 0.19612
[6]	valid_0's binary_logloss: 0.192597	valid_0's binary_logloss: 0.192597
[7]	valid_0's binary_logloss: 0.189925	valid_0's binary_logloss: 0.189925
[8]	valid_0's binary_logloss: 0.187549	valid_0's binary_logloss: 0.187549
[9]	valid_0's binary_logloss: 0.185763	valid_0's binary_logloss: 0.185763
[10]	valid_0's binary_logloss: 0.184209	valid_0's binary_logloss: 0.184209
[11]	valid_0's binary_logloss: 0.182831	valid_0's binary_logloss: 0.182831
[12]	valid_0's binary_logloss: 0.181548	valid_0's binary_logloss: 0.181548
[13]	valid_0's binary_logloss: 0.180728	valid_0'

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=1000, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0,
        scale_pos_weight=2, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [468]:
y_pred_lightgbm = lightgbm_model.predict(X_test)

In [469]:
calculate_scores(y_test, y_pred_lightgbm)

(0.951590909090909, 0.4635149023638232, 0.7940140845070423)

In [358]:
lightgbm_model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 1000,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'scale_pos_weight': 2}