# Preprocessing data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
app_info_df = pd.read_csv("application_info.csv")
dflt_flg_df = pd.read_csv("default_flg.csv")

In [3]:
app_info_df.head()

Unnamed: 0,id,application_dt,sample_cd,education_cd,gender_cd,age,car_own_flg,car_type_flg,appl_rej_cnt,good_work_flg,Score_bki,out_request_cnt,region_rating,home_address_cd,work_address_cd,income,SNA,first_time_cd,Air_flg
0,1,01JAN2014,train,SCH,M,27,Y,Y,0,0,-1.917831,0,40,2,3,32000,1,4,N
1,2,01JAN2014,train,GRD,F,26,N,N,0,0,-1.153144,2,60,2,3,50000,2,1,N
2,3,01JAN2014,train,SCH,M,35,N,N,0,1,-1.73281,0,40,1,2,20000,4,3,N
3,4,01JAN2014,train,GRD,F,35,N,N,0,1,-2.552133,2,20,2,3,80000,1,3,N
4,5,01JAN2014,train,UGR,F,24,N,N,0,0,-1.914581,1,50,2,3,27000,1,2,N


In [4]:
dflt_flg_df.head()

Unnamed: 0,id,default_flg
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


In [5]:
print(app_info_df.shape)
print(dflt_flg_df.shape)

(205296, 19)
(180816, 2)


In [6]:
print("Дубликатов в датафрейме: ", app_info_df.duplicated().sum())
print("Уникальные значения в 'sample_cd': ", app_info_df["sample_cd"].unique())

Дубликатов в датафрейме:  0
Уникальные значения в 'sample_cd':  ['train' 'validate' 'test']


In [7]:
merge_df = pd.merge(app_info_df, dflt_flg_df, how='inner')

In [8]:
merge_df.shape

(180816, 20)

In [9]:
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180816 entries, 0 to 180815
Data columns (total 20 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               180816 non-null  int64  
 1   application_dt   180816 non-null  object 
 2   sample_cd        180816 non-null  object 
 3   education_cd     180093 non-null  object 
 4   gender_cd        180816 non-null  object 
 5   age              180816 non-null  int64  
 6   car_own_flg      180816 non-null  object 
 7   car_type_flg     180816 non-null  object 
 8   appl_rej_cnt     180816 non-null  int64  
 9   good_work_flg    180816 non-null  int64  
 10  Score_bki        180816 non-null  float64
 11  out_request_cnt  180816 non-null  int64  
 12  region_rating    180816 non-null  int64  
 13  home_address_cd  180816 non-null  int64  
 14  work_address_cd  180816 non-null  int64  
 15  income           180816 non-null  int64  
 16  SNA              180816 non-null  int6

In [10]:
merge_df.describe(include="all")

Unnamed: 0,id,application_dt,sample_cd,education_cd,gender_cd,age,car_own_flg,car_type_flg,appl_rej_cnt,good_work_flg,Score_bki,out_request_cnt,region_rating,home_address_cd,work_address_cd,income,SNA,first_time_cd,Air_flg,default_flg
count,180816.0,180816,180816,180093,180816,180816.0,180816,180816,180816.0,180816.0,180816.0,180816.0,180816.0,180816.0,180816.0,180816.0,180816.0,180816.0,180816,180816.0
unique,,243,2,5,2,,2,2,,,,,,,,,,,2,
top,,18MAR2014,train,SCH,F,,N,N,,,,,,,,,,,N,
freq,,1491,110148,89919,103990,,124511,148914,,,,,,,,,,,152038,
mean,90408.5,,,,,39.326254,,,0.279466,0.16535,-1.948117,2.066747,57.073047,1.662032,2.597171,40856.240477,1.735256,2.775385,,0.115222
std,52197.227474,,,,,11.528616,,,0.780688,0.371498,0.490409,2.176197,13.177174,0.500779,0.65007,43219.279694,1.123841,1.022766,,0.319291
min,1.0,,,,,21.0,,,0.0,0.0,-3.624586,0.0,20.0,1.0,1.0,1000.0,1.0,1.0,,0.0
25%,45204.75,,,,,30.0,,,0.0,0.0,-2.29486,1.0,50.0,1.0,2.0,20000.0,1.0,2.0,,0.0
50%,90408.5,,,,,37.0,,,0.0,0.0,-1.969554,2.0,50.0,2.0,3.0,30000.0,1.0,3.0,,0.0
75%,135612.25,,,,,48.0,,,0.0,0.0,-1.619331,3.0,70.0,2.0,3.0,49000.0,2.0,4.0,,0.0


## Замена пропусков в education_cd

In [11]:
print("Количество пустых значений в education_cd: ", merge_df.education_cd.isna().sum())

Количество пустых значений в education_cd:  723


По-сравнению с общим числом объектов, 723 - небольшое количество, а значит можем заменить пустые значения модой

In [12]:
fill_education = merge_df.copy(deep=True)

In [13]:
fill_education["education_cd"] = fill_education["education_cd"].fillna(fill_education["education_cd"].mode()[0])

In [14]:
print("Количество пропусков: ", fill_education["education_cd"].isna().sum())

Количество пропусков:  0


In [15]:
fill_education["education_cd"].unique()

array(['SCH', 'GRD', 'UGR', 'PGR', 'ACD'], dtype=object)

## Преобразование категориальных фичей в числовые через one-hot encoding

In [16]:
fill_education.head()

Unnamed: 0,id,application_dt,sample_cd,education_cd,gender_cd,age,car_own_flg,car_type_flg,appl_rej_cnt,good_work_flg,Score_bki,out_request_cnt,region_rating,home_address_cd,work_address_cd,income,SNA,first_time_cd,Air_flg,default_flg
0,1,01JAN2014,train,SCH,M,27,Y,Y,0,0,-1.917831,0,40,2,3,32000,1,4,N,0
1,2,01JAN2014,train,GRD,F,26,N,N,0,0,-1.153144,2,60,2,3,50000,2,1,N,0
2,3,01JAN2014,train,SCH,M,35,N,N,0,1,-1.73281,0,40,1,2,20000,4,3,N,0
3,4,01JAN2014,train,GRD,F,35,N,N,0,1,-2.552133,2,20,2,3,80000,1,3,N,0
4,5,01JAN2014,train,UGR,F,24,N,N,0,0,-1.914581,1,50,2,3,27000,1,2,N,0


In [17]:
category_columns = ["education_cd", "gender_cd", "car_own_flg", "car_type_flg", "Air_flg"]

In [18]:
data_category = pd.get_dummies(fill_education[category_columns], drop_first=True).astype('int64')
data_category.head()

Unnamed: 0,education_cd_GRD,education_cd_PGR,education_cd_SCH,education_cd_UGR,gender_cd_M,car_own_flg_Y,car_type_flg_Y,Air_flg_Y
0,0,0,1,0,1,1,1,0
1,1,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0
3,1,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0


In [19]:
cat_df = fill_education.copy(deep=True)

In [20]:
cat_df.drop(category_columns, axis=1, inplace=True)
cat_df = pd.concat([cat_df, data_category], axis=1)
cat_df.head()

Unnamed: 0,id,application_dt,sample_cd,age,appl_rej_cnt,good_work_flg,Score_bki,out_request_cnt,region_rating,home_address_cd,...,first_time_cd,default_flg,education_cd_GRD,education_cd_PGR,education_cd_SCH,education_cd_UGR,gender_cd_M,car_own_flg_Y,car_type_flg_Y,Air_flg_Y
0,1,01JAN2014,train,27,0,0,-1.917831,0,40,2,...,4,0,0,0,1,0,1,1,1,0
1,2,01JAN2014,train,26,0,0,-1.153144,2,60,2,...,1,0,1,0,0,0,0,0,0,0
2,3,01JAN2014,train,35,0,1,-1.73281,0,40,1,...,3,0,0,0,1,0,1,0,0,0
3,4,01JAN2014,train,35,0,1,-2.552133,2,20,2,...,3,0,1,0,0,0,0,0,0,0
4,5,01JAN2014,train,24,0,0,-1.914581,1,50,2,...,2,0,0,0,0,1,0,0,0,0


## Обработка выбросов в appl_rej_cnt и out_request_cnt

In [21]:
extreme_values = cat_df.copy(deep=True)

In [22]:
def out_request_cnt_flag(str_):
    if str_ > 10:
        return 1
    else:
        return 0

def appl_rej_cnt_flag(str_):
    if str_ > 5:
        return 1
    else:
        return 0

In [23]:
extreme_values["out_request_cnt>10"] = extreme_values["out_request_cnt"].apply(out_request_cnt_flag)

In [24]:
extreme_values["appl_rej_cnt>5"] = extreme_values["appl_rej_cnt"].apply(appl_rej_cnt_flag)

In [25]:
extreme_values = extreme_values.drop(columns=["out_request_cnt", "appl_rej_cnt"])

## Преобразование времени

In [27]:
def create_final_temporal_features(df, date_col='application_dt'):
    """
    Создает оптимальные временные признаки
    """
    df_temp = df.copy()
    
    # Конвертируем дату
    df_temp[date_col] = pd.to_datetime(df_temp[date_col], format='%d%b%Y')
    
    # СЕЗОНЫ
    def get_season(month):
        if month in [12, 1, 2]:
            return 'winter'
        elif month in [3, 4, 5]:
            return 'spring'
        elif month in [6, 7, 8]:
            return 'summer'
        else:  # 9, 10, 11
            return 'autumn'
    
    df_temp['month'] = df_temp[date_col].dt.month
    df_temp['season'] = df_temp['month'].apply(get_season)
    
    # МЕСЯЦ В СЕЗОНЕ (1-3)
    def month_in_season(month):
        if month in [12, 1, 2]:
            return [12, 1, 2].index(month) + 1
        elif month in [3, 4, 5]:
            return [3, 4, 5].index(month) + 1
        elif month in [6, 7, 8]:
            return [6, 7, 8].index(month) + 1
        else:
            return [9, 10, 11].index(month) + 1
    
    df_temp['month_in_season'] = df_temp['month'].apply(month_in_season)
    
    # ЧЕТВЕРТЬ МЕСЯЦА (1-4)
    def quarter_of_month(day):
        if day <= 7:
            return 1
        elif day <= 14:
            return 2
        elif day <= 21:
            return 3
        else:
            return 4
    
    df_temp['month_quarter'] = df_temp[date_col].dt.day.apply(quarter_of_month)
    
    # 4. ДЕНЬ НЕДЕЛИ (0-6, где 0=понедельник)
    df_temp['day_of_week'] = df_temp[date_col].dt.dayofweek
    
    # БУЛЕВЫЕ ПРИЗНАКИ
    # Праздники (Новый год, майские)
    def is_holiday(date):
        month, day = date.month, date.day
        # Новогодние праздники (25 дек - 10 янв)
        if (month == 12 and day >= 25) or (month == 1 and day <= 10):
            return 1
        # Майские праздники (1-10 мая)
        elif month == 5 and day <= 10:
            return 1
        else:
            return 0
    
    df_temp['is_holiday'] = df_temp[date_col].apply(is_holiday)
    
    # Конец/начало месяца
    df_temp['is_month_start'] = df_temp[date_col].dt.is_month_start.astype(int)
    df_temp['is_month_end'] = df_temp[date_col].dt.is_month_end.astype(int)
    
    # Рабочий день (пн-пт)
    df_temp['is_weekday'] = (df_temp['day_of_week'] < 5).astype(int)
    
    # Удаляем вспомогательные колонки
    df_temp = df_temp.drop(['month', date_col], axis=1)
    
    return df_temp

In [28]:
df_with_time = create_final_temporal_features(extreme_values)
df_with_time.head()

Unnamed: 0,id,sample_cd,age,good_work_flg,Score_bki,region_rating,home_address_cd,work_address_cd,income,SNA,...,out_request_cnt>10,appl_rej_cnt>5,season,month_in_season,month_quarter,day_of_week,is_holiday,is_month_start,is_month_end,is_weekday
0,1,train,27,0,-1.917831,40,2,3,32000,1,...,0,0,winter,2,1,2,1,1,0,1
1,2,train,26,0,-1.153144,60,2,3,50000,2,...,0,0,winter,2,1,2,1,1,0,1
2,3,train,35,1,-1.73281,40,1,2,20000,4,...,0,0,winter,2,1,2,1,1,0,1
3,4,train,35,1,-2.552133,20,2,3,80000,1,...,0,0,winter,2,1,2,1,1,0,1
4,5,train,24,0,-1.914581,50,2,3,27000,1,...,0,0,winter,2,1,2,1,1,0,1


In [29]:
df_with_time.season.unique()

array(['winter', 'spring', 'summer'], dtype=object)

In [30]:
season_column = ["season"]

In [31]:
season_category = pd.get_dummies(df_with_time[season_column]).astype('int64')
season_category.head()

Unnamed: 0,season_spring,season_summer,season_winter
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


In [32]:
df = df_with_time.copy(deep=True)

In [33]:
df.drop(season_column, axis=1, inplace=True)
df = pd.concat([df, season_category], axis=1)
df.head()

Unnamed: 0,id,sample_cd,age,good_work_flg,Score_bki,region_rating,home_address_cd,work_address_cd,income,SNA,...,month_in_season,month_quarter,day_of_week,is_holiday,is_month_start,is_month_end,is_weekday,season_spring,season_summer,season_winter
0,1,train,27,0,-1.917831,40,2,3,32000,1,...,2,1,2,1,1,0,1,0,0,1
1,2,train,26,0,-1.153144,60,2,3,50000,2,...,2,1,2,1,1,0,1,0,0,1
2,3,train,35,1,-1.73281,40,1,2,20000,4,...,2,1,2,1,1,0,1,0,0,1
3,4,train,35,1,-2.552133,20,2,3,80000,1,...,2,1,2,1,1,0,1,0,0,1
4,5,train,24,0,-1.914581,50,2,3,27000,1,...,2,1,2,1,1,0,1,0,0,1


In [34]:
df.sample_cd.unique()

array(['train', 'validate'], dtype=object)

In [35]:
df.default_flg.value_counts()

default_flg
0    159982
1     20834
Name: count, dtype: int64

# Select and train model

In [36]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

In [37]:
from lightgbm import LGBMClassifier

In [38]:
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve

In [39]:
import optuna

In [40]:
df = df.drop(columns=["sample_cd"])

In [41]:
df = df.drop(columns=["id"])

In [42]:
X, y = df.drop(columns=["default_flg"]), df.default_flg

In [43]:
type(df)

pandas.core.frame.DataFrame

In [44]:
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=0.40, random_state=42, shuffle=True)

In [45]:
X_train.shape

(108489, 29)

In [46]:
X_val_test.shape

(72327, 29)

In [47]:
X_val, y_val = X_val_test.iloc[22032:], y_val_test.iloc[22032:]
X_test, y_test = X_val_test.iloc[0:22031], y_val_test.iloc[0:22031]

In [48]:
print("X_val: ", X_val.shape)
print("y_val: ", y_val.shape)
print()
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

X_val:  (50295, 29)
y_val:  (50295,)

X_test:  (22031, 29)
y_test:  (22031,)


In [49]:
class_weights = {0: 1.0, 1: 8}

In [50]:
lgb_model = LGBMClassifier(
)

In [51]:
lgb_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 12519, number of negative: 95970
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005860 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 596
[LightGBM] [Info] Number of data points in the train set: 108489, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.115394 -> initscore=-2.036788
[LightGBM] [Info] Start training from score -2.036788


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [52]:
y_pred = lgb_model.predict(X_val)

In [53]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     44516
           1       0.51      0.01      0.02      5779

    accuracy                           0.89     50295
   macro avg       0.70      0.50      0.48     50295
weighted avg       0.84      0.89      0.83     50295



In [56]:
val_test_score = roc_auc_score(y_val, lgb_model.predict_proba(X_val)[:, 1])
print(f"Test AUC: {val_test_score:.3f}")

Test AUC: 0.726


## Подбор параметров через optuna

In [57]:
def objective_sklearn(trial):
    """
    Функция для оптимизации гиперпараметров LightGBM через Optuna
    с учетом особенностей LightGBM и дисбаланса классов
    """
    
    params = {
        # ОСНОВНЫЕ ПАРАМЕТРЫ
        'objective': 'binary',  
        'boosting_type': 'gbdt',
        'verbose': -1, 
        'random_state': 42,
        
        # ОПТИМИЗИРУЕМЫЕ ПАРАМЕТРЫ
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'num_leaves': trial.suggest_int('num_leaves', 20, 40),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        
        # ДИСБАЛАНС КЛАССОВ
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 5, 15),
        
        # РЕГУЛЯРИЗАЦИЯ (чтобы избежать переобучения)
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        
        # СКОРОСТЬ ОБУЧЕНИЯ
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        
        # СЭМПЛИРОВАНИЕ (борьба с переобучением)
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 10),
        
        'n_jobs': 1,
    }
    

    model = LGBMClassifier(**params)
    
    cv = StratifiedKFold(
        n_splits=3, 
        shuffle=True, 
        random_state=42
    )
    
    scores = cross_val_score(
        model, 
        X_train, 
        y_train,
        cv=cv, 
        scoring='roc_auc',  
        n_jobs=1,  
        verbose=0,
        error_score='raise'
    )
    
    return np.mean(scores)

In [58]:
study = optuna.create_study(
    direction='maximize',  
    study_name='lightgbm_credit_scoring',
    sampler=optuna.samplers.TPESampler(seed=42),  
    pruner=optuna.pruners.MedianPruner( 
        n_warmup_steps=10,
        n_min_trials=5
    )
)

[I 2025-12-09 16:18:06,689] A new study created in memory with name: lightgbm_credit_scoring


In [111]:
# Запускаем оптимизацию
print("Начинаем оптимизацию гиперпараметров...")
study.optimize(
    objective_sklearn, 
    n_trials=30,  # Количество итераций
    show_progress_bar=True,  # Прогресс-бар
    gc_after_trial=True  # Очистка памяти после каждого trial
)

# Выводим результаты
print("\n" + "="*50)
print("ОПТИМИЗАЦИЯ ЗАВЕРШЕНА")
print("="*50)
print(f"Лучший ROC-AUC: {study.best_value:.4f}")
print(f"Количество trials: {len(study.trials)}")
print("\nЛучшие параметры:")

for key, value in study.best_params.items():
    print(f"  {key}: {value}")

Начинаем оптимизацию гиперпараметров...


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-12-06 17:47:55,602] Trial 0 finished with value: 0.7136014851673659 and parameters: {'n_estimators': 250, 'num_leaves': 39, 'max_depth': 7, 'min_child_samples': 34, 'scale_pos_weight': 6.560186404424365, 'reg_alpha': 0.004207053950287938, 'reg_lambda': 0.0017073967431528124, 'learning_rate': 0.13394334706750485, 'subsample': 0.8404460046972835, 'colsample_bytree': 0.8832290311184181, 'subsample_freq': 1}. Best is trial 0 with value: 0.7136014851673659.
[I 2025-12-06 17:48:14,982] Trial 1 finished with value: 0.73103757686878 and parameters: {'n_estimators': 488, 'num_leaves': 37, 'max_depth': 4, 'min_child_samples': 17, 'scale_pos_weight': 6.834045098534338, 'reg_alpha': 0.016480446427978974, 'reg_lambda': 0.12561043700013558, 'learning_rate': 0.03647316284911211, 'subsample': 0.7164916560792167, 'colsample_bytree': 0.8447411578889518, 'subsample_freq': 2}. Best is trial 1 with value: 0.73103757686878.
[I 2025-12-06 17:48:23,844] Trial 2 finished with value: 0.7269741902280401 

In [112]:
print(dict(study.best_params.items()))

{'n_estimators': 406, 'num_leaves': 40, 'max_depth': 3, 'min_child_samples': 15, 'scale_pos_weight': 11.135712739455382, 'reg_alpha': 0.0014341754109660826, 'reg_lambda': 0.5210082757225214, 'learning_rate': 0.047127077338180494, 'subsample': 0.8230729607127126, 'colsample_bytree': 0.6525305504945635, 'subsample_freq': 2}


In [64]:
params = {'n_estimators': 406, 'num_leaves': 40, 'max_depth': 3, 'min_child_samples': 15, 'scale_pos_weight': 11.135712739455382, 'reg_alpha': 0.0014341754109660826, 'reg_lambda': 0.5210082757225214, 'learning_rate': 0.047127077338180494, 'subsample': 0.8230729607127126, 'colsample_bytree': 0.6525305504945635, 'subsample_freq': 2}

In [65]:
tuned_lgb_model = LGBMClassifier(**params)

In [67]:
tuned_lgb_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 12519, number of negative: 95970
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005057 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 596
[LightGBM] [Info] Number of data points in the train set: 108489, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.115394 -> initscore=-2.036788
[LightGBM] [Info] Start training from score -2.036788


0,1,2
,boosting_type,'gbdt'
,num_leaves,40
,max_depth,3
,learning_rate,0.047127077338180494
,n_estimators,406
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [68]:
y_pred = tuned_lgb_model.predict(X_val)

In [69]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.49      0.65     44516
           1       0.17      0.81      0.28      5779

    accuracy                           0.53     50295
   macro avg       0.56      0.65      0.47     50295
weighted avg       0.86      0.53      0.61     50295



In [70]:
val_test_score = roc_auc_score(y_val, lgb_model.predict_proba(X_val)[:, 1])
print(f"Test AUC: {val_test_score:.3f}")

Test AUC: 0.726


In [78]:
val_proba = lgb_model.predict_proba(X_val)[:, 1]

In [99]:
new_threshold = 0.25
y_pred_new = (val_proba >= new_threshold).astype(int)

# Проверь новые метрики
print(classification_report(y_val, y_pred_new))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92     44516
           1       0.32      0.25      0.28      5779

    accuracy                           0.85     50295
   macro avg       0.61      0.59      0.60     50295
weighted avg       0.84      0.85      0.84     50295



In [136]:
df.to_csv("precessed_df.csv")
