In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [99]:
def normalize(df):
    return (df-df.mean())/df.std()

In [100]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train.drop(columns=['5'], inplace=True)
test.drop(columns=['5'], inplace=True)

num_cols = list(train.columns[:-1])
train[num_cols] = normalize(train[num_cols])
test[num_cols] = normalize(test[num_cols])

np.shape(train)

(6963, 31)

In [101]:
np.shape(test)

(3920, 30)

In [103]:
X_train, X_val, y_train, y_val = train_test_split(train[num_cols], train.iloc[:, -1], test_size=0.3, random_state=0)
X_test = test[num_cols]

### SVM 

In [104]:
model = svm.SVC()
model = model.fit(X_train, y_train)
y_val_pred = model.predict(X_val)

f1_score(y_val, y_val_pred), precision_score(y_val, y_val_pred), recall_score(y_val, y_val_pred)

(0.8801431127012522, 0.9479768786127167, 0.8213689482470785)

In [105]:
confusion_matrix(y_val, y_val_pred)

array([[1463,   27],
       [ 107,  492]], dtype=int64)

In [106]:
X = pd.concat([X_train, X_val])
y = pd.concat([y_train, y_val])
model.fit(X, y)

In [107]:
answer = pd.DataFrame()
answer['target'] = model.predict(X_test)

In [108]:
answer.target.value_counts()

0    2317
1    1603
Name: target, dtype: int64

### Oversampling

In [109]:
# separate minority and majority class
from sklearn.utils import resample

train_data = pd.concat([X_train, y_train], axis=1)

zero_class = train_data[train_data['target']==0] 
one_class = train_data[train_data['target']==1]  # minority
print(len(zero_class), len(one_class))

one_class_upsampled = resample(one_class, replace=True, 
                              n_samples=len(zero_class),
                              random_state=27)
print(len(zero_class), len(one_class_upsampled))

3473 1401
3473 3473


In [110]:
# combine majority and upsampled minority
upsampled = pd.concat([zero_class, one_class_upsampled])

X_train_up = upsampled.copy()
X_train_up.drop(columns='target', inplace=True)
y_train_up = upsampled['target']

model = svm.SVC() ### подобранные параметры kernel='rbf', C=1.33)
model = model.fit(X_train_up, y_train_up)
y_val_pred = model.predict(X_val)

f1_score(y_val, y_val_pred), precision_score(y_val, y_val_pred), recall_score(y_val, y_val_pred)

(0.8995057660626029, 0.8878048780487805, 0.9115191986644408)

In [111]:
confusion_matrix(y_val, y_val_pred)

array([[1421,   69],
       [  53,  546]], dtype=int64)

In [112]:
X = pd.concat([X_train, X_val])
y = pd.concat([y_train, y_val])

In [113]:
answer = pd.DataFrame()
answer['target'] = model.predict(X_test)
answer.to_csv('svc_all_normal_sub.csv', index=False)

In [114]:
answer.target.value_counts()

0    2119
1    1801
Name: target, dtype: int64

### Данное решение выше было загружено на All_Cups