In [None]:
# Collect every column of `train_num` that has at least one value outside the
# Tukey fences [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
# NOTE(review): assumes `train_num` holds only numeric columns — confirm where it is built.
outlier_cols = []
for column in train_num.columns:
    Q1 = train_num[column].quantile(0.25)
    Q3 = train_num[column].quantile(0.75)
    IQR = Q3 - Q1

    outliers = (train_num[column] < Q1 - 1.5 * IQR) | (train_num[column] > Q3 + 1.5 * IQR)

    # Series.any() is a vectorised reduction; the builtin any() would walk the
    # boolean Series element by element in Python.
    if outliers.any():
        outlier_cols.append(column)

outlier_cols


In [None]:
from math import *

# Draw one box plot per outlier-bearing column on a grid three panels wide.
n_plot_cols = 3
n_plot_rows = ceil(len(outlier_cols) / n_plot_cols)

# With no outlier columns there is nothing to draw (and nrows=0 would raise).
if outlier_cols:
    # squeeze=False keeps `axes` two-dimensional even when there is a single
    # row; without it axes[row, col] raises IndexError for three plots or fewer.
    fig, axes = plt.subplots(nrows=n_plot_rows, ncols=n_plot_cols,
                             figsize=(15, 5 * n_plot_rows), squeeze=False)

    for i, column in enumerate(outlier_cols):
        row, col = divmod(i, n_plot_cols)
        sns.boxplot(x=train_num[column], ax=axes[row, col], color='skyblue')
        axes[row, col].set_title(f'Box Plot for {column}', fontsize=20)

    # Hide the panels of the last row that received no plot.
    for unused_ax in axes.flat[len(outlier_cols):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Pairwise correlation heatmap of the columns in `test_num`.
# NOTE(review): the cells above explore `train_num`; confirm `test_num` is the intended frame.
correlation = test_num.corr()

# The figure must be created before drawing: sizing it after plt.show() has no
# effect on the heatmap and only leaves an empty figure behind.
plt.figure(figsize=(23, 18))
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Columns from which IQR outliers are filtered by the loop at the end of this cell.
large_outliers_cols = [
    'Credit_Limit',
    'Avg_Open_To_Buy',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt',
    'Total_Ct_Chng_Q4_Q1',
]

def remove_outliers(frame, column, factor=1.5):
    """Return the rows of ``frame`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``frame[column]``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; it is not modified.
    column : str
        Name of the column to filter on.
    factor : float, default 1.5
        Multiplier applied to the IQR when placing the fences.

    Returns
    -------
    pandas.DataFrame
        The rows inside the fences, with their original index labels. Rows
        whose value is NaN fail both comparisons and are therefore dropped.
    """
    values = frame[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    margin = factor * (Q3 - Q1)
    lower_bound = Q1 - margin
    upper_bound = Q3 + margin

    return frame[(values >= lower_bound) & (values <= upper_bound)]

# Filter `df` one column at a time. Each pass recomputes the quartiles on the
# rows that survived the previous passes, so the result depends on column order.
# NOTE(review): `df` is not defined in the cells visible here, and re-running
# this cell filters the already-filtered frame again — confirm the source frame.
for col in large_outliers_cols:
    df = remove_outliers(df, col)

In [None]:
from sklearn.utils import resample

# Split the training frame by label: target == 1 versus target == 0.
train_fraud = train.loc[train['target'] == 1]
train_non_fraud = train.loc[train['target'] == 0]

# Undersample the target == 0 rows (without replacement) down to the number of
# target == 1 rows so both classes end up the same size.
train_non_fraud_balanced = resample(
    train_non_fraud,
    n_samples=len(train_fraud),
    replace=False,
    random_state=42,
)

# Stack both classes, shuffle the rows reproducibly and renumber the index.
train_balanced = (
    pd.concat([train_fraud, train_non_fraud_balanced])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f'Размер сбалансированного набора данных: {train_balanced.shape}')
print(train_balanced['target'].value_counts())

In [None]:
# Display the training rows labelled target == 1.
train.loc[train['target'].eq(1)]

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate the features from the identifier and the label.
coldr = ['session_id', 'target']
X = train_balanced.drop(columns=coldr)  # `columns=` already implies axis=1
y = train_balanced['target']

scaler = StandardScaler()

# NOTE(review): the scaler is fitted on all of X before the train/validation
# split in the next cell, so validation rows influence the scaling statistics.
# Consider fitting on the training part only.
numeric_columns_X = X.select_dtypes(include='number').columns
X[numeric_columns_X] = scaler.fit_transform(X[numeric_columns_X])

test_bebra = test.drop(columns='session_id')
# Transform exactly the columns the scaler was fitted on, in the same order.
# Selecting the test frame's numeric columns independently can yield a
# different set or order than the fitted one.
numeric_columns_test = numeric_columns_X
test_bebra[numeric_columns_test] = scaler.transform(test_bebra[numeric_columns_test])

In [None]:
# Keep 70 % of the rows for training; the remaining 30 % is halved into a
# validation part and a hold-out test part.
X_train_full, X_temp, y_train_full, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [None]:
# Baseline: logistic regression scored by ROC-AUC on the validation split.
reg_model = LogisticRegression(max_iter=1000, random_state=42)
reg_model.fit(X_train_full, y_train_full)

# Keep the second predict_proba column (index 1) as the score.
y_pred_proba_reg = reg_model.predict_proba(X_valid)[:, 1]
roc_auc = roc_auc_score(y_valid, y_pred_proba_reg)

print(f'ROC-AUC на отложенной выборке: {roc_auc}')

In [None]:
from sklearn.model_selection import GridSearchCV, learning_curve, validation_curve


# Base estimator whose hyper-parameters are tuned below.
xgb_model = XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
)

# 3 * 3 * 3 * 2 = 54 candidate settings.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
)

# Exhaustive search with 3-fold cross-validation, ranked by ROC-AUC.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_full, y_train_full)

best_params = grid_search.best_params_
print(f'Best parameters: {best_params}')

# Score the best estimator found by the search on the validation split.
best_xgb = grid_search.best_estimator_
y_pred_proba_xgb_best = best_xgb.predict_proba(X_valid)[:, 1]
roc_auc_xgb_best = roc_auc_score(y_valid, y_pred_proba_xgb_best)
print(f'Tuned XGBoost ROC-AUC: {roc_auc_xgb_best}')

In [None]:
import matplotlib.pyplot as plt
# Bar chart with one bar per feature column of X, sized by the tuned model's
# feature_importances_.
features = X.columns
feature_importances = best_xgb.feature_importances_

sns.barplot(y=features, x=feature_importances).set_title(
    'Feature Importance for Best XGBoost Model'
)
plt.show()

In [None]:
# Validation curve: training vs. cross-validated ROC-AUC as n_estimators grows.
param_range = [50, 100, 150]
train_scores, test_scores = validation_curve(
    XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    X_train_full,
    y_train_full,
    param_name='n_estimators',
    param_range=param_range,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
)

# Average along axis 1 to get one score per n_estimators value.
train_scores_mean = train_scores.mean(axis=1)
test_scores_mean = test_scores.mean(axis=1)

plt.figure(figsize=(8, 6))
for curve, curve_label in (
    (train_scores_mean, "Training score"),
    (test_scores_mean, "Validation score"),
):
    plt.plot(param_range, curve, label=curve_label, marker='o')
plt.title("Validation Curve for XGBClassifier")
plt.xlabel("Number of Estimators")
plt.ylabel("ROC-AUC")
plt.legend(loc="best")
plt.grid()
plt.show()

In [None]:
# Learning curve: ROC-AUC of the tuned model at ten training-set fractions
# evenly spaced from 10 % to 100 %.
train_sizes, train_scores, test_scores = learning_curve(
    best_xgb,
    X_train_full,
    y_train_full,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
)

# Average along axis 1 to get one score per training-set size.
train_scores_mean = train_scores.mean(axis=1)
test_scores_mean = test_scores.mean(axis=1)

plt.figure(figsize=(8, 6))
for curve, curve_label in (
    (train_scores_mean, "Training score"),
    (test_scores_mean, "Validation score"),
):
    plt.plot(train_sizes, curve, label=curve_label, marker='o')
plt.title("Learning Curve for Best XGBClassifier")
plt.xlabel("Training Set Size")
plt.ylabel("ROC-AUC")
plt.legend(loc="best")
plt.grid()
plt.show()


In [None]:
# `y_pred_proba_test` was never assigned in any cell shown here, so this cell
# failed on a fresh "Restart & Run All". Compute it explicitly.
# NOTE(review): assumes the submission should come from the tuned XGBoost model
# applied to the scaled test features — confirm against the intended model.
y_pred_proba_test = best_xgb.predict_proba(test_bebra)[:, 1]

# One row per test session: its id and the predicted score.
output = pd.DataFrame({
    'session_id': test['session_id'],
    'target': y_pred_proba_test
})

output.to_csv('predictions.csv', index=False)

In [None]:
from io import StringIO
import pickle

from fastapi import FastAPI, File, HTTPException, UploadFile
import pandas as pd

# Load the serialized model once, at import time.
# NOTE(security): pickle.load can execute arbitrary code; only load a file
# produced by a trusted training run.
# NOTE(review): no cell shown here writes 'xgb_model.pkl' — confirm where it is created.
with open('xgb_model.pkl', mode='rb') as model_file:
    model = pickle.load(model_file)

app = FastAPI()

def add_features(data, time_columns, site_columns, time_column_for_year_month):
    """Add four session-level feature columns to ``data`` in place and return it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the timestamp and site columns; it is mutated.
    time_columns : list of str
        Timestamp columns; the first and last entries bound the session.
    site_columns : list of str
        Columns whose distinct values are counted per row.
    time_column_for_year_month : str
        Timestamp column from which ``year_month`` is derived.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data``, with these columns added:
        ``year_month`` (the timestamp formatted as ``%Y%m`` and cast to float),
        ``average_time_gap`` (mean difference in seconds between consecutive
        timestamps, or None when any timestamp in the row is missing),
        ``session_length`` (seconds between the first and last time column) and
        ``unique_sites`` (distinct non-null values across ``site_columns``).
    """

    def _mean_gap_seconds(row):
        # Rows with any missing timestamp get no average gap at all.
        if not row.notnull().all():
            return None
        stamps = pd.Series(pd.to_datetime(row, errors='coerce'))
        return stamps.diff().mean().total_seconds()

    month_source = pd.to_datetime(data[time_column_for_year_month], errors='coerce')
    data['year_month'] = month_source.dt.strftime('%Y%m').astype('float')

    data['average_time_gap'] = data[time_columns].apply(_mean_gap_seconds, axis=1)

    # Unparseable values become NaT and propagate to a NaN session length.
    session_end = pd.to_datetime(data[time_columns[-1]], errors='coerce')
    session_start = pd.to_datetime(data[time_columns[0]], errors='coerce')
    data['session_length'] = (session_end - session_start).dt.total_seconds()

    data['unique_sites'] = data[site_columns].nunique(axis=1)
    return data

@app.post("/predict/")
async def predict(file: UploadFile = File(...)):
    """Predict on an uploaded CSV of sessions.

    The CSV is expected to contain columns whose names start with ``time`` and
    ``site``. Session features are derived with ``add_features``, the raw time
    columns and ``session_id`` (if present) are dropped, and the remaining
    columns are passed to ``model.predict``.

    Returns
    -------
    dict
        ``{"predictions": [...]}`` with one entry per CSV row.

    Raises
    ------
    HTTPException
        400 when the upload is not valid UTF-8 CSV or has no ``time*`` columns.
    """
    contents = await file.read()
    try:
        data = pd.read_csv(StringIO(contents.decode('utf-8')))
    except (UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # A malformed upload is a client error, not a server fault.
        raise HTTPException(status_code=400, detail=f"Could not parse the uploaded CSV: {exc}") from exc

    time_columns = [col for col in data.columns if col.startswith('time')]
    site_columns = [col for col in data.columns if col.startswith('site')]
    if not time_columns:
        # add_features indexes time_columns[0] / [-1]; fail with a clear message instead.
        raise HTTPException(status_code=400, detail="The uploaded CSV has no columns starting with 'time'.")
    data = add_features(data, time_columns, site_columns, time_columns[0])

    data = data.drop(columns=time_columns)
    # The identifier is not a feature; tolerate uploads that omit it.
    data = data.drop(columns=['session_id'], errors='ignore')
    # NOTE(review): no feature scaling is applied here. If the pickled model was
    # fitted on StandardScaler-transformed features, the same scaler must be
    # applied before predicting — confirm against the training pipeline.
    predictions = model.predict(data)

    return {"predictions": predictions.tolist()}

In [None]:
import streamlit as st
import pandas as pd
import requests
# Streamlit front end: upload a CSV of sessions, send it to the prediction
# service and show (and optionally save) the returned predictions.
st.title("Прогнозирование злоумышленника")
st.write("Загрузите файл сессий для обработки и анализа.")

uploaded_file = st.file_uploader("Загрузите CSV файл", type="csv")

if uploaded_file is not None:
    data = uploaded_file.read()
    try:
        # Without a timeout the page hangs forever if the service stalls;
        # (connect, read) timeouts in seconds.
        response = requests.post(
            "http://127.0.0.1:8000/predict/",
            files={"file": data},
            timeout=(5, 120),
        )
    except requests.RequestException as exc:
        # Service down, refused connection or timeout: show a message instead
        # of an unhandled traceback.
        st.error(f"Не удалось связаться с сервисом предсказаний: {exc}")
    else:
        if response.status_code == 200:
            result = pd.DataFrame(response.json())
            st.write("Обработанные данные и предсказания:")
            st.write(result)

            if st.button("Сохранить результаты"):
                result.to_csv("results.csv", index=False)
                st.success("Результаты сохранены в файл results.csv.")
        else:
            st.error("Ошибка обработки данных.")

