In [2]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

# The seaborn style sheets bundled with matplotlib were renamed to
# 'seaborn-v0_8-*' in matplotlib 3.6 and the old names were removed afterwards,
# so pick whichever spelling this installation provides (falling back to the
# default style) instead of failing on newer matplotlib versions.
plt.style.use(next(
    (name for name in ('seaborn-v0_8-ticks', 'seaborn-ticks')
     if name in plt.style.available),
    'default',
))
%matplotlib inline

In [3]:
# Allow up to 50 columns when a frame is rendered
pd.set_option("display.max_columns", 50)

# Load the data set and show its dimensions
df = pd.read_csv("data/heart.csv")
print(df.shape)

# Preview the first rows
df.head()

(3656, 16)


Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
# Regressors: every column of the frame except the outcome column
features = list(df.columns)
features.remove('TenYearCHD')

# Right-hand side of the model formula: the regressor names joined with "+"
rhs = "+".join(features)

In [5]:
# Import train_test_split from sklearn
from sklearn.model_selection import train_test_split

# Split the data into a training part and a 30% test part.
# The whole frame (target column included) is passed as X, so the resulting
# frames can be handed directly to a formula-based model as `data`.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df["TenYearCHD"],
    test_size=0.3,
    random_state=0,
)
print(X_train.shape, X_test.shape)

(2559, 16) (1097, 16)


In [6]:
# Refit the model for test shares of 10%, 20%, ..., 90% and compare the fit
for k in range(1, 10):
    X_train, X_test, y_train, y_test = train_test_split(
        df, df["TenYearCHD"], test_size=0.1*k, random_state=2020
    )
    mod = sm.GLM.from_formula(
        formula="TenYearCHD~"+rhs,
        data=X_train,
        family=sm.families.Binomial(),
    )
    res = mod.fit()
    # Predictions on the training part (predict() without arguments)
    predsTrain = res.predict()
    # Predictions on the validation part
    preds = res.predict(X_test)
    trainAuc = round(roc_auc_score(y_train, predsTrain), 4)
    validAuc = round(roc_auc_score(y_test, preds), 4)
    print(f"test: {k/10}, Train AUC:", trainAuc, "Valid AUC:", validAuc)

test: 0.1, Train AUC: 0.7394 Valid AUC: 0.731
test: 0.2, Train AUC: 0.7401 Valid AUC: 0.7279
test: 0.3, Train AUC: 0.7358 Valid AUC: 0.7417
test: 0.4, Train AUC: 0.7377 Valid AUC: 0.7352
test: 0.5, Train AUC: 0.7349 Valid AUC: 0.733
test: 0.6, Train AUC: 0.7379 Valid AUC: 0.7313
test: 0.7, Train AUC: 0.7353 Valid AUC: 0.7198
test: 0.8, Train AUC: 0.7513 Valid AUC: 0.7155
test: 0.9, Train AUC: 0.7794 Valid AUC: 0.717


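Dla udziału zbioru testowego powyżej 0.7 widać przetrenowanie: AUC na zbiorze treningowym rośnie (0.7513 i 0.7794), a na walidacyjnym spada (ok. 0.716). Najwyższe AUC walidacyjne (0.7417) uzyskano dla udziału 0.3.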

In [7]:
# Import KFold
from sklearn.model_selection import KFold

# Fold splitter: 10-fold cross-validation on shuffled rows
kf = KFold(n_splits=10, shuffle=True, random_state=2020)

# split() yields only row numbers for each fold, which are then used with iloc
for train, test in kf.split(df.index.values):
    trainFold = df.iloc[train]
    testFold = df.iloc[test]

    # Build and estimate the model on the training rows of this fold
    mod = sm.GLM.from_formula(
        formula="TenYearCHD~"+rhs,
        data=trainFold,
        family=sm.families.Binomial(),
    )
    res = mod.fit()

    # Predictions on the training rows
    predsTrain = res.predict()
    # Predictions on the validation rows
    predsTest = res.predict(testFold)

    trainAuc = np.round(roc_auc_score(trainFold.TenYearCHD, predsTrain), 4)
    validAuc = np.round(roc_auc_score(testFold.TenYearCHD, predsTest), 4)
    print("Train AUC:", trainAuc, "Valid AUC:", validAuc)

Train AUC: 0.7394 Valid AUC: 0.731
Train AUC: 0.7399 Valid AUC: 0.7246
Train AUC: 0.736 Valid AUC: 0.7668
Train AUC: 0.7414 Valid AUC: 0.711
Train AUC: 0.7378 Valid AUC: 0.7392
Train AUC: 0.7428 Valid AUC: 0.7131
Train AUC: 0.7366 Valid AUC: 0.7542
Train AUC: 0.742 Valid AUC: 0.7111
Train AUC: 0.7438 Valid AUC: 0.6938
Train AUC: 0.736 Valid AUC: 0.7663


In [8]:
def CVTest(nFolds = 5, randomState=2020, debug=False, data=None, formula=None,
           target="TenYearCHD"):
    """Cross-validate a binomial GLM and collect per-fold AUC scores.

    Parameters
    ----------
    nFolds : int
        Number of folds, passed to ``KFold(n_splits=...)``.
    randomState : int
        Seed for the shuffled fold assignment.
    debug : bool
        When true, print the train and validation AUC of every fold.
    data : pandas.DataFrame, optional
        Frame holding the regressors and the target column. When omitted,
        the notebook-level ``df`` is used.
    formula : str, optional
        Model formula. When omitted, ``target + "~" + rhs`` is built from
        the notebook-level ``rhs``.
    target : str
        Name of the outcome column; rows equal to 1 are scored as positives.

    Returns
    -------
    tuple
        ``(trainResults, testResults, predictions, indices)`` - four lists
        with one entry per fold: train AUC, validation AUC, the validation
        predictions as a list, and the index labels of the validation rows.
    """
    # Fall back to the notebook globals so existing calls keep working
    if data is None:
        data = df
    if formula is None:
        formula = target + "~" + rhs

    kf = KFold(n_splits=nFolds, shuffle=True, random_state=randomState)

    # Per-fold results
    testResults = []
    trainResults = []
    predictions = []
    indices = []

    for train, test in kf.split(data.index.values):
        trainFold = data.iloc[train]
        testFold = data.iloc[test]

        # Estimate the GLM on the training rows of this fold
        mod = sm.GLM.from_formula(formula=formula, data=trainFold,
                                  family=sm.families.Binomial())
        res = mod.fit()
        predsTrain = res.predict()
        preds = res.predict(testFold)

        # Keep the validation predictions together with their labels in the
        # original frame; tolist() already returns a fresh list
        predictions.append(preds.tolist())
        indices.append(testFold.index.tolist())

        # AUC against the binarised target (1 -> positive, anything else -> 0)
        trainScore = roc_auc_score((trainFold[target] == 1).astype(int), predsTrain)
        testScore = roc_auc_score((testFold[target] == 1).astype(int), preds)

        trainResults.append(trainScore)
        testResults.append(testScore)

        # Optional per-fold report
        if debug:
            print("Train AUC:", trainScore,
                  "Valid AUC:", testScore)

    return trainResults, testResults, predictions, indices
 

In [9]:
# Run 10-fold cross-validation and report the mean train / validation AUC
trainResults, testResults, predictions, indices = CVTest(nFolds=10, randomState=2020)
meanTrainAuc = np.mean(trainResults)
meanTestAuc = np.mean(testResults)
print(meanTrainAuc, meanTestAuc)

0.7395719878621537 0.7311313397929364


In [10]:
# Bundle the cross-validation output with a description of the model;
# the lists are shallow-copied so later reassignment does not affect the record
resultM1 = dict(
    name="Ekonometria",
    description="Model ekonometryczny",
    specification="TenYearCHD~"+rhs,
    trainResults=list(trainResults),
    testResults=list(testResults),
    predictions=list(predictions),
    indices=list(indices),
)

In [11]:
import pickle

# Open the file for binary writing using a with block
with open("model_ekonometria_1.p", "wb") as fp:
    # Serialize the result dictionary to the open file handle
    pickle.dump(resultM1, fp)