In [51]:
from configparser import ConfigParser
import psycopg2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import recall_score, precision_score
#Can be very helpful to notice any imbalance in classes
from collections import Counter 
import sweetviz as sv
from matplotlib import pyplot as plt
from sklearn import tree

In this part, we try to connect to PSQL

In [52]:
#Sourced from https://www.postgresqltutorial.com/postgresql-python/connect/
def config(filename='database.ini', section='postgresql'):
    parser = ConfigParser()
    # read config file
    parser.read(filename) 
 
    # get section, default to postgresql
    db = {}
    if parser.has_section(section):
        params = parser.items(section)
        for param in params:
            db[param[0]] = param[1]
    else:
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))
 
    return db

In [53]:
#Get the configuration file as a python dictionary
cfg = config()

In [54]:
#Establish the connection and create a cursor to the database
try:
    print("Here's an attempt to connect to the database")
    conn = psycopg2.connect(**cfg)
    cursor = conn.cursor()
    print("Look's like it was a success")
    
except (Exception, psycopg2.DatabaseError) as error:
    print(error)

Here's an attempt to connect to the database
Look's like it was a success


In [55]:
try:
    #Lets get our data 
    cursor.execute('''SELECT p.acquisition_group, p.gender, w.daily_high_temperature, s.title, f.unresolved::INTEGER, f.resolved::INTEGER,
                    f.fatal::INTEGER, m.retail_and_recreation::INTEGER
                    FROM covid19_tracking_fact_table f
                    INNER JOIN onset_date_dimension d
                    ON d.date_surrogate_key = f.onset_date_surrogate_key
                    INNER JOIN patient_dimension p 
                    ON p.patient_surrogate_key = f.patient_surrogate_key
                    INNER JOIN weather_dimension w 
                    ON w.weather_surrogate_key = f.weather_surrogate_key
                    INNER JOIN special_measures_dimension s
                    ON s.special_measures_surrogate_key = f.special_measures_surrogate_key
                    INNER JOIN mobility_dimension m 
                    ON m.mobility_surrogate_key = f.mobility_surrogate_key
                    '''
                  )

    #Get the complete result set. It will be a list of tuples where each tuple is a row from the result set
    result_list = cursor.fetchall()
        
except (Exception, psycopg2.DatabaseError) as error:
    print(error)

In [56]:
#Ensure to run this cell at the end of all your experiments to close all connections
cursor.close()
conn.close()

Let's prepare the DataFrame.

In [57]:
#Now, 
result_df = pd.DataFrame(result_list, columns=['acquisition_group',
                                               'gender',
                                               'daily_high_temperature',
                                               'special_measure',
                                               'is_unresolved',
                                               'is_resolved',
                                               'is_fatal',
                                               'retail_and_recreation_mobility'])

Let's see how the data looks like

In [58]:
result_df.head()

Unnamed: 0,acquisition_group,gender,daily_high_temperature,special_measure,is_unresolved,is_resolved,is_fatal,retail_and_recreation_mobility
0,CC,MALE,32.0,Stage 2,0,1,0,-18
1,CC,MALE,37.0,Stage 2,0,1,0,-24
2,CC,FEMALE,34.5,Stage 2,0,1,0,-22
3,OB,MALE,32.5,Stage 2,0,1,0,-48
4,CS,FEMALE,37.0,Stage 2,0,1,0,-24


In [59]:
my_report = sv.analyze(result_df)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), la…




In [60]:
my_report.show_notebook(  w=None, 
                h=None, 
                scale=None,
                layout='widescreen',
                filepath=None)

In [61]:
result_df.columns

Index(['acquisition_group', 'gender', 'daily_high_temperature',
       'special_measure', 'is_unresolved', 'is_resolved', 'is_fatal',
       'retail_and_recreation_mobility'],
      dtype='object')

In [62]:
result_df.drop(result_df[result_df['acquisition_group']=='MISSING INFORMATION'].index, inplace = True)
#result_df.drop(result_df[ result_df['age_group']=='UNKNOWN'].index, inplace = True)
result_df.drop(result_df[ result_df['gender'].isin(['UNSPECIFIED', 'GENDER DIVERSE'])].index, inplace = True)

In [63]:
result_df.is_unresolved.value_counts()

0    29722
1        3
Name: is_unresolved, dtype: int64

In [64]:
df = result_df.drop_duplicates()

In [65]:
print(result_df.is_unresolved.value_counts())
print(result_df.is_resolved.value_counts())
print(result_df.is_fatal.value_counts())

0    29722
1        3
Name: is_unresolved, dtype: int64
1    29355
0      370
Name: is_resolved, dtype: int64
0    29358
1      367
Name: is_fatal, dtype: int64


Both labels are highly imbalanced! We need to make sure to consider this when we test our models.

In [66]:
new_result = pd.get_dummies(df, columns=['acquisition_group', 'gender','special_measure'])

In [67]:
new_result

Unnamed: 0,daily_high_temperature,is_unresolved,is_resolved,is_fatal,retail_and_recreation_mobility,acquisition_group_CC,acquisition_group_CS,acquisition_group_OB,acquisition_group_TRAVEL,gender_FEMALE,gender_MALE,special_measure_Stage 2,special_measure_Stage 2 Modified,special_measure_Stage 3,special_measure_Stage 3 Modified
0,32.0,0,1,0,-18,1,0,0,0,0,1,1,0,0,0
1,37.0,0,1,0,-24,1,0,0,0,0,1,1,0,0,0
2,34.5,0,1,0,-22,1,0,0,0,1,0,1,0,0,0
3,32.5,0,1,0,-48,0,0,1,0,0,1,1,0,0,0
4,37.0,0,1,0,-24,0,1,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36563,23.0,0,0,1,-21,0,0,1,0,0,1,0,1,0,0
36686,22.0,0,0,1,-28,0,1,0,0,0,1,0,0,1,0
36745,30.0,0,1,0,-25,0,0,0,1,0,1,0,0,1,0
36809,28.5,0,0,1,-33,0,0,1,0,0,1,1,0,0,0


In [68]:
y = new_result.iloc[:,1:4]
y

Unnamed: 0,is_unresolved,is_resolved,is_fatal
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
...,...,...,...
36563,0,0,1
36686,0,0,1
36745,0,1,0
36809,0,0,1


In [69]:
X = new_result.drop(y, axis = 1)
X

Unnamed: 0,daily_high_temperature,retail_and_recreation_mobility,acquisition_group_CC,acquisition_group_CS,acquisition_group_OB,acquisition_group_TRAVEL,gender_FEMALE,gender_MALE,special_measure_Stage 2,special_measure_Stage 2 Modified,special_measure_Stage 3,special_measure_Stage 3 Modified
0,32.0,-18,1,0,0,0,0,1,1,0,0,0
1,37.0,-24,1,0,0,0,0,1,1,0,0,0
2,34.5,-22,1,0,0,0,1,0,1,0,0,0
3,32.5,-48,0,0,1,0,0,1,1,0,0,0
4,37.0,-24,0,1,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
36563,23.0,-21,0,0,1,0,0,1,0,1,0,0
36686,22.0,-28,0,1,0,0,0,1,0,0,1,0
36745,30.0,-25,0,0,0,1,0,1,0,0,1,0
36809,28.5,-33,0,0,1,0,0,1,1,0,0,0


In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [71]:
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

In [72]:
y_pred = dt.predict(X_test)
recall = recall_score(y_pred, y_test, average='micro') * 100
precision = precision_score(y_pred, y_test, average='micro') * 100
print("Recall of Decision Tree {:.2f} %".format(recall))
print("Precision of Decision Tree {:.2f} %".format(precision))

Recall of Decision Tree 86.80 %
Precision of Decision Tree 84.44 %


In [73]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)

0.8443579766536965

In [74]:
list(X.columns)

['daily_high_temperature',
 'retail_and_recreation_mobility',
 'acquisition_group_CC',
 'acquisition_group_CS',
 'acquisition_group_OB',
 'acquisition_group_TRAVEL',
 'gender_FEMALE',
 'gender_MALE',
 'special_measure_Stage 2',
 'special_measure_Stage 2 Modified',
 'special_measure_Stage 3',
 'special_measure_Stage 3 Modified']

In [75]:
from sklearn.tree import export_text
print(export_text(decision_tree = dt, feature_names = list(X.columns), max_depth=4))

|--- acquisition_group_OB <= 0.50
|   |--- daily_high_temperature <= 20.25
|   |   |--- retail_and_recreation_mobility <= -17.50
|   |   |   |--- acquisition_group_TRAVEL <= 0.50
|   |   |   |   |--- special_measure_Stage 2 Modified <= 0.50
|   |   |   |   |   |--- truncated branch of depth 16
|   |   |   |   |--- special_measure_Stage 2 Modified >  0.50
|   |   |   |   |   |--- truncated branch of depth 14
|   |   |   |--- acquisition_group_TRAVEL >  0.50
|   |   |   |   |--- retail_and_recreation_mobility <= -37.50
|   |   |   |   |   |--- truncated branch of depth 5
|   |   |   |   |--- retail_and_recreation_mobility >  -37.50
|   |   |   |   |   |--- class: 0
|   |   |--- retail_and_recreation_mobility >  -17.50
|   |   |   |--- gender_FEMALE <= 0.50
|   |   |   |   |--- daily_high_temperature <= 14.25
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- daily_high_temperature >  14.25
|   |   |   |   |   |--- truncated branch of depth 8
|   |   |   |--- gender_FEMALE >  0.50
|  