In [31]:
!pip install duckdb==0.5.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [32]:
from google.colab import drive
drive.mount('/content/drive')

#from google.colab import drive
#drive.flush_and_unmount()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
import duckdb
import pandas as pd
import os
from IPython.display import display
import numpy as np
import random

random.seed(10)

In [34]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Declaration of auxiliar functions

In [35]:
def generate_table(conn, df, name):
        drop_table(conn, name)
        existingTables=conn.execute("SHOW TABLES").fetchall()
        if(len(existingTables)>0):
            for table in existingTables:
                if(table[0] == name):
                    conn.execute("DROP TABLE " + name)
        conn.execute("CREATE TABLE " + name + " AS SELECT * FROM df")
        
def drop_table(conn, name):
        existingTables=conn.execute("SHOW TABLES").fetchall()
        if(len(existingTables)>0):
            for table in existingTables:
                if(table[0] == name):
                    conn.execute("DROP TABLE " + name)

# **Loading training and testing datasets**

In [60]:
conn = duckdb.connect("/content/drive/MyDrive/ADSDB BASES DE DATOS/copy_DB_Train&Test",read_only=False)
existingTables=conn.execute("SHOW TABLES").fetchall()
existingTables

[('test_data',), ('train_data',)]

In [37]:
train=conn.execute("SELECT * from train_data".format()).fetchdf()
test=conn.execute("SELECT * from test_data".format()).fetchdf()

In [38]:
#show the table 
train.head()

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,pc11,pc12,pc13,pc14,pc15,status
0,-1.21232,-0.744516,-0.258124,-2.104505,-1.048234,-0.766083,-0.589792,-1.146065,-0.493231,0.532046,-1.103725,0.139414,-0.328295,-0.07656,-0.429384,operating
1,-0.356183,-0.402907,-0.64567,-0.275511,0.659401,0.151428,0.241339,-0.066138,0.17294,0.075112,0.075376,0.33233,0.178172,-0.600583,-0.396184,operating
2,-0.248418,-0.761326,1.770784,0.502847,0.218923,-0.110257,0.63222,0.362389,0.369597,0.188637,-0.028121,-0.172988,-0.333601,-0.625036,-0.376714,acquired
3,26.54471,-2.605882,1.128625,2.902888,3.542717,-5.455574,-1.4621,6.853976,-4.11602,-3.496515,6.276827,0.222887,1.941,4.431875,-2.10465,closed
4,9.634521,-1.123115,-1.8803,-0.003073,4.63551,-0.442441,4.319383,3.349798,10.212528,-2.228692,-2.206531,11.068832,8.132775,2.694862,8.290349,operating


In [39]:
test.head()

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,pc11,pc12,pc13,pc14,pc15,status
0,-1.21232,-0.744516,-0.258124,-2.104505,-1.048234,-0.766083,-0.589792,-1.146065,-0.493231,0.532046,-1.103725,0.139414,-0.328295,-0.07656,-0.429384,operating
1,-0.356183,-0.402907,-0.64567,-0.275511,0.659401,0.151428,0.241339,-0.066138,0.17294,0.075112,0.075376,0.33233,0.178172,-0.600583,-0.396184,operating
2,-0.248418,-0.761326,1.770784,0.502847,0.218923,-0.110257,0.63222,0.362389,0.369597,0.188637,-0.028121,-0.172988,-0.333601,-0.625036,-0.376714,acquired
3,26.54471,-2.605882,1.128625,2.902888,3.542717,-5.455574,-1.4621,6.853976,-4.11602,-3.496515,6.276827,0.222887,1.941,4.431875,-2.10465,closed
4,9.634521,-1.123115,-1.8803,-0.003073,4.63551,-0.442441,4.319383,3.349798,10.212528,-2.228692,-2.206531,11.068832,8.132775,2.694862,8.290349,operating


In [40]:
X_train = train.iloc[:, train.columns != "status"]
y_train = train.status
X_test = test.iloc[:, test.columns != "status"]
y_test = test.status

In [41]:
conn.close()

# **DecisionTree hyperparameter optimization using Grid Search**

In [42]:
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn import decomposition, datasets
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler


In [43]:
dec_tree = tree.DecisionTreeClassifier()

In [44]:
dec_tree.get_params().keys()

dict_keys(['ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'random_state', 'splitter'])

In [45]:
pipe = Pipeline(steps=[('dec_tree', dec_tree)])

In [46]:
criterion = ['gini', 'entropy']
max_depth = [6,8,10,12,20,30,40]
min_samples_splits = [0.2,0.4,0.6,0.8,1.]#min_samples_split represents the minimum number of samples required to split an internal node
min_samples_leafs = [0.1, 0.2, 0.3, 0.4, 0.5]#The minimum number of samples required to be at a leaf node. 
#max_features = list(range(1,X_train.shape[1]))#max_features represents the number of features to consider when looking for the best split.

In [47]:
parameters = dict(dec_tree__criterion=criterion,
                  dec_tree__max_depth=max_depth,
                  dec_tree__min_samples_split=min_samples_splits,
                  dec_tree__min_samples_leaf=min_samples_leafs,
                  #dec_tree__max_features=max_features
                  )

In [48]:
parameters

{'dec_tree__criterion': ['gini', 'entropy'],
 'dec_tree__max_depth': [6, 8, 10, 12, 20, 30, 40],
 'dec_tree__min_samples_split': [0.2, 0.4, 0.6, 0.8, 1.0],
 'dec_tree__min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5]}

In [49]:
clf_GS = GridSearchCV(pipe, parameters)
clf_GS.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('dec_tree', DecisionTreeClassifier())]),
             param_grid={'dec_tree__criterion': ['gini', 'entropy'],
                         'dec_tree__max_depth': [6, 8, 10, 12, 20, 30, 40],
                         'dec_tree__min_samples_leaf': [0.1, 0.2, 0.3, 0.4,
                                                        0.5],
                         'dec_tree__min_samples_split': [0.2, 0.4, 0.6, 0.8,
                                                         1.0]})

In [50]:
print('Best criterion:', clf_GS.best_estimator_.get_params()['dec_tree__criterion'])
print('Best max_depth:', clf_GS.best_estimator_.get_params()['dec_tree__max_depth'])
print('Best min_samples_split:', clf_GS.best_estimator_.get_params()['dec_tree__min_samples_split'])
print('Best min_samples_leaf:', clf_GS.best_estimator_.get_params()['dec_tree__min_samples_leaf'])
#print('Best max_features:', clf_GS.best_estimator_.get_params()['dec_tree_max__features'])

print(); print(clf_GS.best_estimator_.get_params()['dec_tree'])

Best criterion: entropy
Best max_depth: 6
Best min_samples_split: 0.2
Best min_samples_leaf: 0.1

DecisionTreeClassifier(criterion='entropy', max_depth=6, min_samples_leaf=0.1,
                       min_samples_split=0.2)


In [51]:
p_criterion=clf_GS.best_estimator_.get_params()['dec_tree__criterion']
p_max_depth=clf_GS.best_estimator_.get_params()['dec_tree__max_depth']
p_min_samples_split=clf_GS.best_estimator_.get_params()['dec_tree__min_samples_split']
p_min_samples_leaf=clf_GS.best_estimator_.get_params()['dec_tree__min_samples_leaf']

# **Training the model with the best hyperparameters configuration**

In [52]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier(criterion=p_criterion, max_depth=20)

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)


In [53]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9229761904761905


In [54]:
import matplotlib.pyplot as plt
#plt.figure(figsize=(12,8))
#tree.plot_tree(clf, filled=True, fontsize=10)
#plt.show()

In [55]:
!pip install graphviz

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [56]:
!pip install pydotplus

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [57]:
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image  
import pydotplus

#dot_data = StringIO()
#export_graphviz(clf, out_file=dot_data,  
                #  filled=True, rounded=True,
                #  special_characters=True)
#graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
#graph.write_png('tree.png')
#Image(graph.create_png())
 


In [58]:
!pip install pickle-mixin


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# **Saving the final model**

In [59]:
import pickle
pickle.dump(clf, open('/content/drive/MyDrive/ADSDB BASES DE DATOS/model.pkl', 'wb'))