Load data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(font_scale=1.5)

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

Preprocess

In [None]:
cols = ["Nomer", "Sended","SendedSum", "Received","ReceivedSum", "UAddresses","UAddressesSum" ,"HCop", "HCopSum", "Cop", "CopSum",  "MesAftX", "MesAftXSum" ,"DaysBet", "DaysBetSr", "RespToLett","RespToLettSum", "Symb", "SymbSum" ,"MesAftW","MesAftWSum" ,"GetSend","GetSendSr" ,"GSByte","GSByteSr" ,"QuestMarks", "QuestMarksSum", "Charachteristic", "Razmetka"   ]
df = pd.read_csv("/content/drive/MyDrive/Original_TXT.txt", sep='\t',  names=cols, skiprows=1, )
df['GetSendSr'] = df['GetSendSr'].str.replace(',', '.')
df['GSByteSr'] = df['GSByteSr'].str.replace(',', '.')
df['DaysBetSr'] = df['DaysBetSr'].str.replace(',', '.')
df.head()

In [None]:
# df['Test'] = df['Name'].str.extract(r',\s([^.]+)\.')
# df['Test'].unique()
# df.groupby('Test').size().sort_values()

Let 's take only where there is more than 1 record

In [None]:
# def preprocess(df):
#   #Change name
#   df['Name'] = df['Name'].str.extract(r',\s([^.]+)\.')
#   for index, name in enumerate(df['Name']):
#     if name not in ['Major', 'Mlle', 'Col', 'Rev', 'Dr', 'Master', 'Mrs', 'Miss', 'Mr']:
#         if df.loc[index, 'Sex'] == 'female':
#             df.loc[index, 'Name'] = 'Mrs'
#         else:
#             df.loc[index, 'Name'] = 'Mr'

#   #Get only TiketPrefix
#   df['Ticket'] = df['Ticket'].str.extract(r'(\D+)(\d+)')[0]
#   df['Ticket'] = df['Ticket'].str.replace(r'[^\w\s]', '').str.replace(' ', '').str[0]

#   #Get only first char in cabin Name
#   df['Cabin'] = df['Cabin'].str[0]
#   return df

In [None]:
# df = preprocess(df)

In [None]:
# df.head()

Next step to create Pipline for learning and fit

In [None]:
numeric_cols = [
    'SendedSum',
    'ReceivedSum',
    'UAddressesSum',
    'HCopSum',
    'CopSum',
    'MesAftXSum',
    'DaysBetSr',
    'RespToLettSum',
    'SymbSum',
    'MesAftWSum',
    'GetSendSr',
    'GSByteSr',
    'QuestMarksSum'
]

target_col = 'Razmetka'

In [None]:
!pip install catboost

In [None]:
np.linspace(0.1, 0.4, 6)

array([0.1 , 0.16, 0.22, 0.28, 0.34, 0.4 ])

In [None]:
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

preprocessor = ColumnTransformer(
    transformers=[
        ('num', make_pipeline(SimpleImputer(strategy='mean'), StandardScaler(), MinMaxScaler()), numeric_cols)
    ]
)

rf = RandomForestClassifier()
gb = CatBoostClassifier()
lor = LogisticRegression()
kNN = KNeighborsClassifier()

ensemble = VotingClassifier(estimators=[('rf', rf), ('gb', gb), ('lor', lor), ('kNN', kNN)], voting='soft')
#ensemble = VotingClassifier(estimators=[('rf', rf), ('gb', gb), ('lor', lor)], voting='soft')
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('ensemble', ensemble)
])

param_grid = {
    'ensemble__rf__n_estimators': [50, 100, 200],
    'ensemble__gb__n_estimators': [50, 100, 200],
    'ensemble__gb__learning_rate': np.linspace(0.1, 0.4, 6),
    'ensemble__lor__C': [0.01, 0.1, 1, 10],
    'ensemble__kNN__n_neighbors': [3, 4, 5, 6]
}

model = GridSearchCV(pipeline, param_grid=param_grid, scoring='roc_auc', cv=5)


In [None]:
model

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df[numeric_cols], df[target_col], train_size=0.8)

In [None]:
model.fit(X_train, y_train)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
0:	learn: 0.2280828	total: 3.14ms	remaining: 624ms
1:	learn: 0.1295947	total: 6.75ms	remaining: 668ms
2:	learn: 0.1061432	total: 9.95ms	remaining: 654ms
3:	learn: 0.0880123	total: 13.2ms	remaining: 647ms
4:	learn: 0.0776272	total: 16.3ms	remaining: 635ms
5:	learn: 0.0724806	total: 19.4ms	remaining: 627ms
6:	learn: 0.0685703	total: 22.4ms	remaining: 618ms
7:	learn: 0.0584792	total: 25.5ms	remaining: 612ms
8:	learn: 0.0548247	total: 29.1ms	remaining: 617ms
9:	learn: 0.0474556	total: 32.2ms	remaining: 612ms
10:	learn: 0.0427114	total: 35.4ms	remaining: 609ms
11:	learn: 0.0391854	total: 38.5ms	remaining: 604ms
12:	learn: 0.0336360	total: 41.7ms	remaining: 600ms
13:	learn: 0.0307693	total: 44.8ms	remaining: 595ms
14:	learn: 0.0282180	total: 47.9ms	remaining: 591ms
15:	learn: 0.0256581	total: 51.1ms	remaining: 588ms
16:	learn: 0.0218104	total: 54.3ms	remaining: 584ms
17:	learn: 0.0212048	total: 57.3ms	remaining

In [None]:
# idx = X_test[X_test['Parch'] == 6].index
# X_test.loc[idx, 'Parch'] = 5

In [1]:
# from sklearn.metrics import classification_report, roc_auc_score, roc_curve
# y_train_predicted = model.predict(X_train)
# y_test_predicted = model.predict(X_test)
# train_auc = roc_auc_score(y_train, y_train_predicted)
# test_auc = roc_auc_score(y_test, y_test_predicted)

# plt.figure(figsize=(10,7))
# plt.plot(*roc_curve(y_train, y_train_predicted)[:2], label='train AUC={:.4f}'.format(train_auc))
# plt.plot(*roc_curve(y_test, y_test_predicted)[:2], label='test AUC={:.4f}'.format(test_auc))
# legend_box = plt.legend(fontsize='large', framealpha=1).get_frame()
# legend_box.set_facecolor("white")
# legend_box.set_edgecolor("black")
# plt.plot(np.linspace(0,1,100), np.linspace(0,1,100))
# plt.show()


In [None]:
probabilities = model.predict_proba(X_test)
print(probabilities)

[[0.98771616 0.01228384]
 [0.99469276 0.00530724]
 [0.96219783 0.03780217]
 [0.99438289 0.00561711]
 [0.98822669 0.01177331]
 [0.9832913  0.0167087 ]
 [0.99777797 0.00222203]
 [0.98120378 0.01879622]
 [0.97608641 0.02391359]
 [0.99828066 0.00171934]
 [0.99744897 0.00255103]
 [0.98327571 0.01672429]
 [0.98944256 0.01055744]
 [0.9945066  0.0054934 ]
 [0.99753339 0.00246661]
 [0.97706325 0.02293675]
 [0.97983238 0.02016762]
 [0.99430013 0.00569987]
 [0.98611655 0.01388345]
 [0.99725636 0.00274364]
 [0.99002577 0.00997423]
 [0.99665725 0.00334275]
 [0.99764318 0.00235682]
 [0.99320588 0.00679412]
 [0.9846584  0.0153416 ]
 [0.98470709 0.01529291]
 [0.99039083 0.00960917]
 [0.99800832 0.00199168]
 [0.9699916  0.0300084 ]
 [0.99488361 0.00511639]
 [0.99563967 0.00436033]
 [0.96753437 0.03246563]
 [0.98848089 0.01151911]
 [0.98576804 0.01423196]
 [0.98953756 0.01046244]
 [0.99116969 0.00883031]
 [0.98393879 0.01606121]
 [0.99253422 0.00746578]
 [0.98855306 0.01144694]
 [0.98803505 0.01196495]


In [None]:
accuracy = model.score(X_test, y_test)

print(f'Точность модели: {accuracy}')

Точность модели: 0.6421232876712328


In [None]:
print(X_test)

     SendedSum  ReceivedSum  UAddressesSum  HCopSum  CopSum  MesAftXSum  \
653        250          488             69       45      34         488   
457        245          539            173      167      95         539   
169        223          507             34       41      52         507   
336        270          545            195       96      85         545   
162        215          324             57       74      62         324   
..         ...          ...            ...      ...     ...         ...   
503        190          566            105      151      83         566   
5          203          526             93       34     140         526   
106        216          635            184       98     154         635   
322        198          549            172      116      72         549   
4          318          982            193      266     250         982   

    DaysBetSr  RespToLettSum  SymbSum  MesAftWSum GetSendSr GSByteSr  \
653         3            48