# Курсовой проект для курса "Алгоритмы анализа данных"

**Задание:** предсказать вероятность того, подойдет ли репетитор для подготовки к экзамену по математике. Вам будут даны два датасета: train.csv (содержит признаки и целевую переменную) и test.csv (только признаки).

**Целевая переменная:** choose

**Метрика:** ROC AUC

**Описание датасета:**

* **Id** - идентификационный номер
* **age** - возраст репетитора
* **years_of_experience** - стаж преподавания
* **lesson_price** - цена за урок
* **qualification** - квалификация
* **physics** - физика
* **chemistry** - химия
* **biology** - биология
* **english** - ангийский
* **geography** - география
* **history** - история
* **mean_exam_points** - средний балл за экзамен

**План курсового проекта:**
* [I. Загрузка библиотек и функций](#1)
* [II. Чтение данных](#2)
* [III. Исследование данных](#3)
* [IV. Масштабирование данных](#4)
* [V. Модель предсказания целевой переменной](#5)
* [VI. Оценка качества классификации](#6)
* [VII. Предсказание и сохранение финальных данных](#7)

## I. Загрузка библиотек и функций <a class='anchor' id='1'>

In [1]:
# Основные библиотеки
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# %matplotlib inline

# Разбивает выборку на обучающую и тестовую
from sklearn.model_selection import train_test_split

from imblearn import over_sampling, under_sampling

# Cборщик мусора
import gc 

## II. Чтение данных <a class='anchor' id='2'>

In [37]:
del df
del fdf
gc.collect()

260

In [38]:
df = pd.read_csv('train.csv')
fdf = pd.read_csv('test.csv')

## III. Исследование данных <a class='anchor' id='3'>

In [16]:
# Проверка типов данных train
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   10000 non-null  int64  
 1   age                  10000 non-null  float64
 2   years_of_experience  10000 non-null  float64
 3   lesson_price         10000 non-null  float64
 4   qualification        10000 non-null  float64
 5   physics              10000 non-null  float64
 6   chemistry            10000 non-null  float64
 7   biology              10000 non-null  float64
 8   english              10000 non-null  float64
 9   geography            10000 non-null  float64
 10  history              10000 non-null  float64
 11  mean_exam_points     10000 non-null  float64
 12  choose               10000 non-null  int64  
dtypes: float64(11), int64(2)
memory usage: 1015.8 KB


In [6]:
# Проверка типов данных test
fdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   10000 non-null  int64  
 1   age                  10000 non-null  float64
 2   years_of_experience  10000 non-null  float64
 3   lesson_price         10000 non-null  float64
 4   qualification        10000 non-null  float64
 5   physics              10000 non-null  float64
 6   chemistry            10000 non-null  float64
 7   biology              10000 non-null  float64
 8   english              10000 non-null  float64
 9   geography            10000 non-null  float64
 10  history              10000 non-null  float64
 11  mean_exam_points     10000 non-null  float64
dtypes: float64(11), int64(1)
memory usage: 937.6 KB


In [39]:
df.head()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points,choose
0,0,35.0,0.0,2150.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,0
1,1,52.0,2.0,1250.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,57.0,1
2,2,29.0,3.0,1750.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,66.0,0
3,3,33.0,3.0,1050.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,66.0,1
4,4,46.0,3.0,2250.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,73.0,0


In [8]:
fdf.head()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
0,10000,32.0,2.0,2700.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0
1,10001,35.0,6.0,1800.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,71.0
2,10002,44.0,2.0,1200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,45.0
3,10003,44.0,4.0,2950.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,92.0
4,10004,38.0,3.0,1400.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,58.0


In [13]:
df.age.sort_values().unique()

array([23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., 34., 35.,
       36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48.,
       49., 50., 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61.,
       62., 63., 64., 65., 66., 67., 68.])

In [28]:
fdf.age.sort_values().unique()

array([23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., 34., 35.,
       36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48.,
       49., 50., 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61.,
       62., 63., 64., 65., 66., 67., 68.])

In [29]:
df.years_of_experience.sort_values().unique()

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

In [30]:
fdf.years_of_experience.sort_values().unique()

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

In [11]:
df.lesson_price.sort_values().unique()

array([ 200.,  350.,  400.,  450.,  500.,  550.,  600.,  650.,  700.,
        750.,  800.,  850.,  900.,  950., 1000., 1050., 1100., 1150.,
       1200., 1250., 1300., 1350., 1400., 1450., 1500., 1550., 1600.,
       1650., 1700., 1750., 1800., 1850., 1900., 1950., 2000., 2050.,
       2100., 2150., 2200., 2250., 2300., 2350., 2400., 2450., 2500.,
       2550., 2600., 2650., 2700., 2750., 2800., 2850., 2900., 2950.,
       3000., 3050., 3100., 3150., 3200., 3250., 3300., 3350., 3400.,
       3450., 3500., 3550., 3600., 3650., 3700., 3750., 3950.])

In [24]:
df[df['lesson_price'] < 500]

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points,choose
788,788,44.0,7.0,350.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,47.0,1
1896,1896,25.0,6.0,350.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0,0
3390,3390,41.0,8.0,200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,52.0,0
3477,3477,40.0,6.0,400.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,57.0,1
5736,5736,37.0,6.0,400.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,63.0,1
6053,6053,33.0,7.0,450.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,1
7841,7841,50.0,5.0,200.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,63.0,1
9028,9028,50.0,7.0,350.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0,1
9355,9355,42.0,8.0,350.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,59.0,1


In [31]:
fdf.lesson_price.sort_values().unique()

array([ 300.,  350.,  400.,  450.,  500.,  550.,  600.,  650.,  700.,
        750.,  800.,  850.,  900.,  950., 1000., 1050., 1100., 1150.,
       1200., 1250., 1300., 1350., 1400., 1450., 1500., 1550., 1600.,
       1650., 1700., 1750., 1800., 1850., 1900., 1950., 2000., 2050.,
       2100., 2150., 2200., 2250., 2300., 2350., 2400., 2450., 2500.,
       2550., 2600., 2650., 2700., 2750., 2800., 2850., 2900., 2950.,
       3000., 3050., 3100., 3150., 3200., 3250., 3300., 3350., 3400.,
       3450., 3500., 3550., 3600., 3650., 3700., 3750., 3800., 3850.,
       3950.])

In [32]:
fdf[fdf['lesson_price'] < 500]

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
21,10021,43.0,6.0,350.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,71.0
2332,12332,56.0,7.0,450.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,69.0
2501,12501,42.0,6.0,450.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,61.0
4349,14349,35.0,5.0,450.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,52.0
4954,14954,48.0,7.0,350.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,56.0
6037,16037,41.0,5.0,300.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,61.0
6879,16879,54.0,5.0,450.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,38.0
6948,16948,68.0,6.0,400.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,58.0
7218,17218,52.0,3.0,450.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,49.0
8309,18309,56.0,6.0,400.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,46.0


In [33]:
df.qualification.sort_values().unique(), fdf.qualification.sort_values().unique()

(array([1., 2., 3., 4.]), array([1., 2., 3., 4.]))

In [34]:
df.physics.sort_values().unique(), fdf.physics.sort_values().unique()

(array([0., 1.]), array([0., 1.]))

In [35]:
df.chemistry.sort_values().unique(), fdf.chemistry.sort_values().unique()

(array([0., 1.]), array([0., 1.]))

In [36]:
df.biology.sort_values().unique(), fdf.biology.sort_values().unique()

(array([0., 1.]), array([0., 1.]))

In [37]:
df.english.sort_values().unique(), fdf.english.sort_values().unique()

(array([0., 1.]), array([0., 1.]))

In [38]:
df.geography.sort_values().unique(), fdf.geography.sort_values().unique()

(array([0., 1.]), array([0., 1.]))

In [39]:
df.history.sort_values().unique(), fdf.history.sort_values().unique()

(array([0., 1.]), array([0., 1.]))

In [21]:
df.mean_exam_points.sort_values().unique()

array([ 33.,  34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,
        44.,  45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,
        55.,  56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,
        66.,  67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,
        77.,  78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,
        88.,  89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,
        99., 100.])

In [40]:
fdf.mean_exam_points.sort_values().unique()

array([32., 33., 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44.,
       45., 46., 47., 48., 49., 50., 51., 52., 53., 54., 55., 56., 57.,
       58., 59., 60., 61., 62., 63., 64., 65., 66., 67., 68., 69., 70.,
       71., 72., 73., 74., 75., 76., 77., 78., 79., 80., 81., 82., 83.,
       84., 85., 86., 87., 88., 89., 90., 91., 92., 93., 94., 95., 96.,
       97., 98.])

## IV. Масштабирование данных <a class='anchor' id='4'>

In [40]:
def standard_scale(x):
    mean = x.mean(axis=0)
    std = x.std(axis=0)
    return (x - mean) / std

In [92]:
X = np.array(df.drop(['choose', 'Id'], axis=1))
y = np.array(df['choose'])
ind = np.array(df['Id'])

X_f = np.array(fdf.drop('Id', axis=1))
ind_f = np.array(fdf[['Id']])

In [93]:
X_st = X.copy()
X_st = standard_scale(X)
X_st

array([[-1.34508978e+00, -1.11773006e+00,  8.54508832e-01, ...,
        -1.68787291e-01, -1.35388105e-01,  7.03586704e-01],
       [ 7.72004745e-01,  1.42631140e-02, -8.63826025e-01, ...,
        -1.68787291e-01,  7.38617327e+00, -5.46933324e-01],
       [-2.09229961e+00,  5.80259700e-01,  9.08044509e-02, ...,
        -1.68787291e-01, -1.35388105e-01,  1.15106691e-01],
       ...,
       [-2.24275031e-01,  1.71225287e+00,  9.08044509e-02, ...,
        -1.68787291e-01,  7.38617327e+00, -3.99813321e-01],
       [-5.97879947e-01, -1.11773006e+00, -4.65859672e-03, ...,
        -1.68787291e-01, -1.35388105e-01,  5.56466700e-01],
       [-5.97879947e-01,  1.71225287e+00, -9.59289073e-01, ...,
        -1.68787291e-01, -1.35388105e-01, -1.79745335e+00]])

In [94]:
X_stf = X_f.copy()
X_stf = standard_scale(X_f)
X_stf

array([[-1.73371969,  0.00807395,  1.90046712, ..., -0.17343093,
        -0.12382046,  1.90506341],
       [-1.36019396,  2.26652127,  0.19020064, ..., -0.17343093,
        -0.12382046,  0.50485588],
       [-0.23961676,  0.00807395, -0.94997702, ..., -0.17343093,
        -0.12382046, -1.41121759],
       ...,
       [-0.23961676,  0.00807395, -0.85496222, ..., -0.17343093,
        -0.12382046, -0.08470519],
       [ 0.63194329,  1.70190944, -1.33003624, ..., -0.17343093,
        -0.12382046, -0.01101005],
       [-0.36412533, -1.12114971, -0.37988819, ..., -0.17343093,
        -0.12382046, -1.70599812]])

## V. Модель предсказания целевой переменной <a class='anchor' id='5'>

In [149]:
# разбиваем датафреймы на тренировочные и валидационные
X_train, X_valid, y_train, y_valid = train_test_split(X_st, y, test_size=0.33, shuffle=True, random_state=19)

In [150]:
X_train.shape, X_valid.shape

((6700, 11), (3300, 11))

In [151]:
def calc_logloss(y, y_pred):
    err = - np.mean(y * np.log(y_pred) + (1.0 - y) * np.log(1.0 - y_pred))
    return err

In [152]:
def sigmoid(z):
    res = 1 / (1 + np.exp(-z))
    return res

In [153]:
def eval_model(X, y, iterations, eta=1e-4):
    np.random.seed(42)
    W = np.random.randn(X.shape[1])
    n = X.shape[0]
    
    for i in range(iterations+1):
        z = np.dot(X, W)
        y_pred = sigmoid(z)
        err = calc_logloss(y, y_pred)
        W -= eta * (1/n * X.T @ (y_pred - y))
        if i % (iterations / 10) == 0:
            print(i, W, err)
            
    return W

In [162]:
W = eval_model(X_train, y_train, iterations=700, eta=0.1)

0 [ 0.49207209 -0.1412805   0.62219385  1.50053422 -0.22412632 -0.22855176
  1.56405251  0.7588608  -0.46439315  0.53613681 -0.47707005] 1.2521412370018756
70 [ 0.19320778 -0.07146205 -0.20712364  0.82252163  0.25268462  0.04958068
  0.63073817  0.25984538 -0.14075492  0.16077191 -0.55231724] 0.7450121697398624
140 [ 0.08085407  0.0237987  -0.42202173  0.67145795  0.32132387  0.11821109
  0.2299368   0.08651305 -0.0111974   0.02643077 -0.26893105] 0.6795353351516755
210 [ 0.05008053  0.0523062  -0.54004823  0.58191631  0.29738272  0.12640884
  0.1326306   0.0468344   0.01434518 -0.00299352 -0.06527559] 0.6676687802171657
280 [ 0.04150787  0.06107589 -0.61394586  0.52082671  0.27055482  0.12390398
  0.11011665  0.03794568  0.01748859 -0.00944223  0.07263426] 0.6633108892081986
350 [ 0.03869472  0.06442475 -0.66146074  0.47832014  0.25110748  0.12060237
  0.10380907  0.03559483  0.01679168 -0.01126232  0.16585559] 0.6613819766786004
420 [ 0.03750557  0.06604246 -0.69236573  0.44859639  0

In [163]:
def calc_pred(W, X):
    y_pred_proba = sigmoid(np.dot(X, W))
    y_pred = np.where(y_pred_proba > 0.5, 1, 0)

    return y_pred

In [167]:
y_train[1:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [164]:
y_train_pred = calc_pred(W, X_train)
y_train_pred[1:20]

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0])

In [168]:
y_valid[1:20]

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1])

In [169]:
y_valid_pred = calc_pred(W, X_valid)
y_valid_pred[1:20]

array([0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1])

## VI. Оценка качества классификации <a class='anchor' id='6'>

In [170]:
# TP  FP
# FN  TN

def matrix(y, y_pred):
    
    conf_matrix = np.zeros(shape=(2, 2))
    
    conf_matrix[0, 0] = np.sum(np.where(y == 0, y_pred == y, False)) # TP
    conf_matrix[1, 1] = np.sum(np.where(y == 1, y_pred == y, False)) # TN
    conf_matrix[1, 0] = np.sum(np.where(y == 1, y_pred != y, False)) # FN
    conf_matrix[0, 1] = np.sum(np.where(y == 0, y_pred != y, False)) # FP
    
    return conf_matrix

In [171]:
matrix(y_train, y_train_pred)

array([[3345., 2607.],
       [  65.,  683.]])

In [172]:
matrix(y_valid, y_valid_pred)

array([[1672., 1267.],
       [  32.,  329.]])

In [192]:
from sklearn.metrics import confusion_matrix as cm

cm(y_valid, y_valid_pred)

array([[1672, 1267],
       [  32,  329]])

In [193]:
def TPR(y, y_pred):
    
    mtx = matrix(y, y_pred)
    
    TP = mtx[0, 0] 
    FN = mtx[1, 0]
    
    return TP / (TP + FN)

In [197]:
def FPR(y, y_pred):
    
    mtx = matrix(y, y_pred)
    
    FP = mtx[0, 1] 
    TN = mtx[1, 1]

    return FP / (FP + TN), 1

In [148]:
TPR = TPR(y_valid, y_valid_pred)
FPR = FPR(y_valid, y_valid_pred)

AUC_ROC = np.trapz(TPR, x = FPR, dx=0.1)
AUC_ROC
# plt.title('ROC curve')
# plt.ylim(0, 1.05)
# plt.xlabel('FPR')
# plt.ylabel('TPR')
# plt.grid()
# plt.legend(' ', title=f'AUC-ROC={AUC_ROC:.3f}', loc='lower right')
# plt.plot(FPR, TPR);

ValueError: diff requires input that is at least one dimensional

In [198]:
FPR(y_valid, y_valid_pred)

(0.793859649122807, 1)

In [144]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [200]:
fpr, tpr, _ = roc_curve(y_valid, y_valid_pred)
fpr, tpr, _

(array([0.        , 0.43109901, 1.        ]), array([0.        , 0.91135734, 1.        ]), array([2, 1, 0]))

In [201]:
roc_auc_score(y_valid, y_valid_pred)

0.740129163725201

In [202]:
AUC_ROC = np.trapz(tpr, x = fpr, dx=0.1)
AUC_ROC

0.740129163725201

## VII. Предсказание и сохранение финальных данных <a class='anchor' id='7'>

In [189]:
y_pred = calc_pred(W, X_stf)
final_df = pd.DataFrame({'Id':ind_f[:,0], 'choose':y_pred})
# final_df.to_csv('predict.csv', index=False, encoding='utf-8')