In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
def df_one_hot(df, column):
        one_hot = pd.get_dummies(df[column], dtype=bool)
        df = df.drop(column, axis=1)
        df = df.join(one_hot)
        return df

def df_normalize(df, column):
    max_value = df[column].max()
    min_value = df[column].min()
    df[column] = (df[column] - min_value) / (max_value - min_value)
    return df

In [3]:
df = pd.read_csv("../datasets/transactions.csv")

df_labels = pd.read_csv("../datasets/train.csv")
df = df.merge(df_labels, left_on='client_id', right_on='client_id')


df = df_one_hot(df, "trans_city")
mcc_code_cl = list(df["mcc_code"].unique())
df = df_one_hot(df, "mcc_code")

drop = ["term_id", "Unnamed: 0"]
df = df.drop(drop, axis=1)

df["big_transaction+"] = df[df["amount"] >= 100000]["amount"]
df["big_transaction+"] = df["big_transaction+"].notna()

df["big_transaction-"] = df[df["amount"] <= -100000]["amount"]
df["big_transaction-"] = df["big_transaction-"].notna()

df["amount+"] = df["amount"].apply(lambda x: x if x>0 else 0)
df["amount-"] = df["amount"].apply(lambda x: x if x<0 else 0)

agg_func_describe = {'Kaliningrad': ['mean'],
                     'Kazan': ['mean'], 
                     'Khabarovsk': ['mean'], 
                     'Moscow': ['mean'], 
                     'Novosibirsk': ['mean'], 
                     'Penza': ['mean'], 
                     'Saint Petersburg': ['mean'], 
                     'Tver': ['mean'], 
                     'Vladimir': ['mean'], 
                     'Vladivostok': ['mean'],
                     "big_transaction-": ["sum"],
                     "big_transaction+": ["sum"],
                     "gender":["mean"],
                     "amount":["mean", "sum", "median"],
                     "amount+":["mean", "sum"],
                     "amount-":["mean", "sum"],
                     }

for cl in mcc_code_cl:
    agg_func_describe[cl] = "sum"

df = df.groupby(['client_id']).agg(agg_func_describe).round(4)

norm = ["amount", "amount+", "amount-"] + mcc_code_cl
for cl in norm:
    df = df_normalize(df, cl)

target = df['gender']
X = df.drop(['gender'], axis=1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, target, stratify=target, random_state=179)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, random_state=179)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred_val = model.predict(X_val)
print('Итоговый f1_score на валидации: ', f1_score(y_val, y_pred_val))
print('Итоговый accuracy на валидации: ', accuracy_score(y_val, y_pred_val))
print('Итоговый ROC-accuracy на валидации: ', roc_auc_score(y_val, y_pred_val))

# y_pred_test = model.predict(X_test)
# print('Итоговый f1_score на тесте: ', f1_score(y_test, y_pred_test))
# print('Итоговый accuracy на тесте: ', accuracy_score(y_test, y_pred_test))

Итоговый f1_score на валидации:  0.70147954743255
Итоговый accuracy на валидации:  0.7581100141043724


  y = column_or_1d(y, warn=True)


In [7]:
print('Итоговый ROC-accuracy на валидации: ', roc_auc_score(y_val, y_pred_val))

Итоговый ROC-accuracy на валидации:  0.7462372089275642
