# DS PROJECT - Predicting the outcome of driving exams in Estonia

### Reading data

In [5]:
import pandas as pd

data = pd.read_csv("cleanedData.csv")

data.head()

Unnamed: 0,SEISUND,KESTUS,VARASEMAID_KATSEID,BYROO_Haapsalu,BYROO_Jõgeva,BYROO_Jõhvi,BYROO_Kuressaare,BYROO_Kärdla,BYROO_Narva,BYROO_Paide,...,KUU_3,KUU_4,KUU_5,KUU_6,KUU_7,KUU_8,KUU_9,KUU_10,KUU_11,KUU_12
0,1,50.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,24.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,24.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,48.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,51.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Train-test split

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = data.drop(["SEISUND"], axis=1)
y = data["SEISUND"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)



### DecisionTree

In [48]:
#Hold-out validation
from sklearn.tree import DecisionTreeClassifier

models_df = pd.DataFrame(columns=['model','params','accuracy','test_accuracy'])

criterions=['gini','entropy']
max_depths=[None,1,2,3,4,5,6,7,8,9,10]
random_state = 0

for crit in criterions:
    for max_depth in max_depths:
        model = DecisionTreeClassifier(criterion = crit, max_depth = max_depth, random_state = random_state)
        model.fit(X_train_sub, y_train_sub)
        acc = accuracy_score(y_val, model.predict(X_val))
        params = model.get_params()
        #test
        acc_test = accuracy_score(y_test,model.predict(X_test))
        models_df.loc[len(models_df)] = pd.Series(["DecisionTree",params,acc,acc_test], index=['model', 'params', 'accuracy','test_accuracy'])

In [52]:
models_df.sort_values(by=['accuracy','test_accuracy'], ascending=False)

Unnamed: 0,model,params,accuracy,test_accuracy
19,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.893824,0.892766
20,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.893724,0.894042
10,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.893525,0.893922
21,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.892927,0.893484
8,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.892677,0.892846
9,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.892627,0.892806
18,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.892627,0.892527
7,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.892179,0.892128
6,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.891481,0.89133
5,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.889836,0.890094


In [61]:
models_df.loc[10]['params'],#best decisiontree with hold-out validation

({'ccp_alpha': 0.0,
  'class_weight': None,
  'criterion': 'gini',
  'max_depth': 10,
  'max_features': None,
  'max_leaf_nodes': None,
  'min_impurity_decrease': 0.0,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0.0,
  'random_state': 0,
  'splitter': 'best'},)

In [54]:
#K-fold cross validation
from sklearn.model_selection import cross_val_score
import numpy as np

results_df = pd.DataFrame(columns=['model', 'params', 'accuracy'])

for crit in criterions:
    for max_depth in max_depths:
        model = DecisionTreeClassifier(criterion = crit, max_depth = max_depth)
        scores = cross_val_score(model, X_train, y_train, cv=5)
        params = model.get_params()
        results_df.loc[len(results_df)] = pd.Series(["DecisionTree",params,np.mean(scores)], index=['model', 'params', 'accuracy'])

In [57]:
results_df.sort_values('accuracy', ascending=False)

Unnamed: 0,model,params,accuracy
10,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.896063
9,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.895734
21,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.895355
8,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.895285
20,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.895185
19,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.895016
7,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.894986
18,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.894288
6,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.892743
5,DecisionTree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.891467


In [59]:
results_df.loc[10].params #K-fold best

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [63]:
#Grid-search
from sklearn.model_selection import GridSearchCV
parameters = {"criterion": criterions,
              "max_depth": max_dephts}

model = DecisionTreeClassifier()
clf = GridSearchCV(model, parameters)
clf.fit(X_train, y_train)

In [64]:
clf.best_params_

{'criterion': 'gini', 'max_depth': 1}

In [65]:
acc = accuracy_score(y_test, clf.predict(X_test))
acc

0.8837135109267826

In [3]:
from sklearn.ensemble import RandomForestClassifier


# Choose a model (Random Forest Classifier in this example)
model = RandomForestClassifier()

# Model training
model.fit(X_train, y_train)

# Model prediction
y_pred = model.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8888977508374541
