In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_curve, auc, precision_recall_curve
)
import seaborn as sns
sns.set(style="whitegrid")
%matplotlib inline

In [None]:
# Load the dataset; the CSV is parsed with ';' as the field delimiter.
df = pd.read_csv('student-mat.csv', delimiter=';')

# Only the last expression of a cell is rendered automatically, so the preview
# is passed to display() explicitly; as a bare mid-cell expression its result
# would be discarded.
display(df.head())

# info() prints its report itself, so it needs no display() wrapper.
df.info()

# Summary statistics for every column (numeric and non-numeric alike);
# as the final expression this is rendered as the cell output.
df.describe(include='all')

In [None]:
# Draw a boxplot for each numeric column.
# Collect the names of all numeric columns of the frame.
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)

# A separate short, wide figure per column keeps every variable on its own axis scale.
for column in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 2))
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(column)
    plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
corr_matrix = df[num_cols].corr()

plt.figure(figsize=(10, 8))
# RdBu is a diverging colormap: pinning the colour range to the full correlation
# range [-1, 1] puts its neutral midpoint at zero correlation. With the default
# autoscaling the midpoint would sit wherever the middle of the observed values
# falls, which makes the colours misleading.
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='RdBu', vmin=-1, vmax=1)
plt.title('correlation matrix')
plt.show()

In [None]:
# Split the column names by dtype: categorical (object/category) versus numeric.
cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

# np.number matches every numeric dtype and is the same selector the earlier
# cells use; listing only 'int64'/'float64' would skip other numeric widths
# such as int32 or float32.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

print("Categorical columns:", cat_cols)
print("Number columns:", num_cols)



In [None]:
# Bucket the final grade G3 into four ordered categories.
# pd.cut closes intervals on the right by default, so these edges produce
# (-1, 9], (9, 13], (13, 16], (16, 20]; the -1 lower edge keeps a grade of 0
# inside the first bucket.
grade_edges = [-1, 9, 13, 16, 20]
grade_labels = ['Fail', 'Pass', 'Good', 'Excellent']
df['G3_category'] = pd.cut(df['G3'], bins=grade_edges, labels=grade_labels)

# Absolute and relative frequency of each category.
category_counts = df['G3_category'].value_counts()
category_share = df['G3_category'].value_counts(normalize=True)
print("Category distribution:")
print(category_counts)
print("\nProportion:")
print(category_share)

# Bar chart of the category counts, with bars in grade order rather than by frequency.
plt.figure(figsize=(8, 4))
sns.countplot(x='G3_category', data=df, order=grade_labels)
plt.title('Distribution of final grades by category')
plt.show()

In [None]:
# Removing G1 and G2 from features
# NOTE(review): the feature/target split below is commented out, so this cell
# defines neither X nor y. Any code that reads X or y raises NameError until
# the split is restored. Once restored, X would also lack the G1, G2 and G3
# columns, so column lists computed from df cannot be used to index it as-is.
# X = df.drop(['G3', 'G3_category', 'G1', 'G2'], axis=1)
# y = df['G3_category']

# print("Features (X):", X.shape)
# print("Target (y):", y.shape)
# print("\nColumns in X:")
# print(X.columns.tolist())

In [None]:
# Distribution overview of the numeric columns: a grid of histograms followed
# by a grid of boxplots, one panel per column.
#
# The panels read from df. The X/y split is commented out in this notebook, so
# X is undefined here and reading it raised NameError. num_cols was also
# computed from df, so it names columns the X described by that disabled split
# would not contain.

def _plot_numeric_grid(data, columns, draw_panel, n_grid_cols=4):
    """Draw one panel per column on a grid and show the figure.

    Parameters
    ----------
    data : DataFrame that is indexed by the names in `columns`.
    columns : list of column names; one panel is drawn for each.
    draw_panel : callable(values, ax) that renders one column on one Axes.
    n_grid_cols : number of panels per grid row.

    Returns
    -------
    (fig, axes) where axes is the flattened array of all grid Axes.
    """
    # Ceiling division: enough rows for every column, and at least one row so
    # an empty column list still yields a valid (blank) figure.
    n_rows = max(1, -(-len(columns) // n_grid_cols))
    # 3 units of height per row reproduces the 16x12 figure for a 4x4 grid.
    fig, axes = plt.subplots(
        n_rows, n_grid_cols, figsize=(16, 3 * n_rows), squeeze=False
    )
    axes = axes.flatten()

    for ax, col in zip(axes, columns):
        draw_panel(data[col], ax)
        ax.set_title(col)

    # Hide grid cells that have no column assigned instead of showing empty frames.
    for ax in axes[len(columns):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()
    return fig, axes


def _draw_histogram(values, ax):
    """Histogram with a KDE overlay; the x label is cleared because the title names the column."""
    sns.histplot(values, kde=True, ax=ax, bins=20)
    ax.set_xlabel('')


def _draw_boxplot(values, ax):
    """Horizontal boxplot of one column."""
    sns.boxplot(x=values, ax=ax)


# Histograms of the numeric features
fig, axes = _plot_numeric_grid(df, num_cols, _draw_histogram)

# Boxplots for spotting outliers
fig, axes = _plot_numeric_grid(df, num_cols, _draw_boxplot)