In [3]:
from configparser import ConfigParser
import psycopg2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import recall_score, precision_score, accuracy_score
#Can be very helpful to notice any imbalance in classes
from collections import Counter 
import sweetviz as sv
from matplotlib import pyplot as plt
from sklearn import tree

## Connection

In [4]:
#Sourced from https://www.postgresqltutorial.com/postgresql-python/connect/
def config(filename='database.ini', section='postgresql'):
    """Load one section of an INI file as a dict of connection parameters.

    Args:
        filename: Path of the INI file to read.
        section: Name of the section whose options are returned.

    Returns:
        dict mapping every option name in ``section`` to its string value.

    Raises:
        FileNotFoundError: ``filename`` is missing or could not be read.
        Exception: The file was read but contains no ``section`` section.
    """
    parser = ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed, so an empty result means "no file".
    # Without this check a missing file is misreported as a missing section.
    if not parser.read(filename):
        raise FileNotFoundError('Config file {0} not found or unreadable'.format(filename))

    if not parser.has_section(section):
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))

    return dict(parser.items(section))

In [5]:
#Get the configuration file as a python dictionary
# Called with no arguments, so config() uses its defaults: the 'postgresql'
# section of 'database.ini'.
cfg = config()

Exception: Section postgresql not found in the database.ini file

In [None]:
#Establish the connection and create a cursor to the database
try:
    print("Here's an attempt to connect to the database")
    # cfg holds the connection parameters; they are passed as keyword arguments.
    conn = psycopg2.connect(**cfg)
    cursor = conn.cursor()
    print("Look's like it was a success")

except (Exception, psycopg2.DatabaseError) as error:
    print(error)
    # Re-raise instead of swallowing: the following cells need `conn` and
    # `cursor`, so carrying on would only surface later as a NameError that
    # hides the real connection problem.
    raise

In [None]:
# Pull the patient attributes, special-measure title, outcome flags and two
# mobility measures from the fact table and its dimensions.
# NOTE(review): onset_date_dimension (d) and weather_dimension (w) are joined
# but contribute no selected columns; as INNER JOINs they still restrict the
# rows returned, so they are kept as-is.
try:
    #Lets get our data
    cursor.execute('''SELECT p.acquisition_group, p.gender, p.age_group, s.title, f.unresolved::INTEGER, f.resolved::INTEGER,
                    f.fatal::INTEGER, m.retail_and_recreation::INTEGER, m.grocery_and_pharmacy::INTEGER
                    FROM covid19_tracking_fact_table f
                    INNER JOIN onset_date_dimension d
                    ON d.date_surrogate_key = f.onset_date_surrogate_key
                    INNER JOIN patient_dimension p 
                    ON p.patient_surrogate_key = f.patient_surrogate_key
                    INNER JOIN weather_dimension w 
                    ON w.weather_surrogate_key = f.weather_surrogate_key
                    INNER JOIN special_measures_dimension s
                    ON s.special_measures_surrogate_key = f.special_measures_surrogate_key
                    INNER JOIN mobility_dimension m 
                    ON m.mobility_surrogate_key = f.mobility_surrogate_key
                    '''
                  )

    #Get the complete result set. It will be a list of tuples where each tuple is a row from the result set
    result_list = cursor.fetchall()

except (Exception, psycopg2.DatabaseError) as error:
    print(error)
    # Re-raise instead of swallowing: without `result_list` the DataFrame
    # cell below fails with a NameError that hides the real query error.
    raise

In [None]:
#Ensure to run this cell at the end of all your experiments to close all connections
# The cursor is closed first, then the connection it was created from.
cursor.close()
conn.close()

## Summarization, Preprocessing and Feature Selection

In [None]:
# Column names, listed in the same order as the SELECT list of the query
# whose rows are held in result_list.
result_columns = [
    'acquisition_group',
    'gender',
    'age_group',
    'special_measure',
    'is_unresolved',
    'is_resolved',
    'is_fatal',
    'retail_and_recreation_mobility',
    'grocery_and_pharmacy_mobility',
]
result_df = pd.DataFrame(result_list, columns=result_columns)

Let's see what the data looks like.

In [None]:
# Preview the first rows of the freshly built frame.
result_df.head()

In [None]:
# Build a Sweetviz analysis of result_df; it is rendered by the next cell.
my_report = sv.analyze(result_df)

In [None]:
# Render the report inline; only the layout is set explicitly, every other
# option is passed as None.
my_report.show_notebook(
    w=None,
    h=None,
    scale=None,
    layout='widescreen',
    filepath=None,
)

In [None]:
# List the column names of result_df.
result_df.columns

In [None]:
# Remove, in place, every row whose acquisition group, age group or gender
# carries one of the placeholder values below. A single combined mask drops
# the same rows as three successive drops.
result_df.drop(
    result_df[
        (result_df['acquisition_group'] == 'MISSING INFORMATION')
        | (result_df['age_group'] == 'UNKNOWN')
        | result_df['gender'].isin(['UNSPECIFIED', 'GENDER DIVERSE'])
    ].index,
    inplace=True,
)

In [None]:
# Count how often each value of is_unresolved occurs in result_df.
result_df.is_unresolved.value_counts()

In [None]:
# De-duplicated copy used for encoding and modelling below; result_df itself
# is not reassigned here.
df = result_df.drop_duplicates()

In [None]:
# Report the class balance of `df`, the de-duplicated frame that is actually
# encoded and modelled below. Counting on result_df would describe the data
# before duplicates were removed.
print(df.is_unresolved.value_counts())
print(df.is_resolved.value_counts())
print(df.is_fatal.value_counts())

The outcome labels are highly imbalanced! We need to take this into account when we evaluate our models.

In [None]:
# One-hot encode the four categorical columns named in `columns`.
new_result = pd.get_dummies(df, columns=['acquisition_group', 'age_group','gender','special_measure'])
# Alternative that leaves acquisition_group un-encoded:
#new_result = pd.get_dummies(df, columns=['age_group','gender','special_measure'])

In [None]:
# Inspect the encoded frame.
new_result

In [None]:
# Select the three outcome flags by name rather than by position, so the
# targets do not depend on the column order produced by get_dummies.
y = new_result[['is_unresolved', 'is_resolved', 'is_fatal']]
y

In [None]:
# Features are every column of new_result that is not one of the target
# columns held in y.
X = new_result.drop(columns=list(y.columns))
X

In [None]:
# Hold out 34% of the rows, stratified on the outcome flags. A fixed
# random_state makes the shuffle, and therefore the split, identical on
# every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.34, shuffle=True, stratify=y, random_state=42
)

In [None]:
# Uncomment to write the current split to the four CSV files that the
# Decision Tree section reads back with pd.read_csv.
# X_train.to_csv('X_train.csv', index=False)
# X_test.to_csv('X_test.csv', index=False)
# y_train.to_csv('y_train.csv', index=False)
# y_test.to_csv('y_test.csv', index=False)

In [None]:
# Uncomment to collapse the outcome flags into a single 'Outcome' column
# (per row, the name of the column that equals 1) and write it to the CSV
# files that the Gradient Boosting section reads back.
# y_train_1d = pd.DataFrame((y_train.iloc[:, 0:] == 1).idxmax(1), columns = ['Outcome'])
# y_test_1d = pd.DataFrame((y_test.iloc[:, 0:] == 1).idxmax(1), columns = ['Outcome'])
# y_train_1d.to_csv('y_train_1d.csv', index=False)
# y_test_1d.to_csv('y_test_1d.csv', index=False)

### Decision Tree

In [None]:
# Reload the persisted split from disk. This rebinds X_train, X_test,
# y_train and y_test, replacing the frames produced by train_test_split.
X_train, X_test, y_train, y_test = [
    pd.read_csv(csv_name)
    for csv_name in ('X_train.csv', 'X_test.csv', 'y_train.csv', 'y_test.csv')
]

In [None]:
# Entropy-criterion decision tree fitted on all outcome columns of y_train.
# random_state is fixed so the fitted tree is reproducible across runs.
dt = DecisionTreeClassifier(criterion='entropy', random_state=42)
dt.fit(X_train, y_train)

In [None]:
y_pred = dt.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). Passing them the other way
# round exchanges the roles of false positives and false negatives, so the
# value printed as recall was really precision, and vice versa.
recall = recall_score(y_test, y_pred, average='micro') * 100
precision = precision_score(y_test, y_pred, average='micro') * 100
print("Recall of Decision Tree {:.2f} %".format(recall))
print("Precision of Decision Tree {:.2f} %".format(precision))

In [None]:
# accuracy_score is already imported in the notebook's first cell, so it is
# not re-imported here. Arguments follow scikit-learn's (y_true, y_pred) order.
accuracy_score(y_test, y_pred)

In [None]:
# Show the feature column names of X as a plain list.
list(X.columns)

In [None]:
# Print the fitted tree as text rules. `tree` is the sklearn.tree module
# imported in the first cell, so no extra import is needed here.
# Feature names come from X_train, the frame dt was fitted on, so they are
# guaranteed to line up with the tree's features.
print(tree.export_text(decision_tree=dt, feature_names=list(X_train.columns)))

In [None]:
# Inspect the class labels stored on the fitted tree.
dt.classes_

In [None]:
# dot_data = tree.export_graphviz(dt, out_file="mytree.dot",  
#                      feature_names=X_train.columns,  
#                      class_names=y_train.columns)  

In [None]:
# import graphviz
# graph = graphviz.Source(dot_data)  

In [None]:
# graph

In [None]:
# Wrap the predictions in a frame with the outcome column names.
# NOTE(review): y_pred is whatever the most recently run predict cell
# assigned - the Decision Tree's when the notebook is run top to bottom.
temp = pd.DataFrame(y_pred, columns=['is_unresolved', 'is_resolved', 'is_fatal'])

In [None]:
# Predictions whose is_unresolved flag is 1.
temp[temp['is_unresolved'] == 1]

### Random Testing

In [None]:
# Test-set targets whose is_fatal flag is 1.
y_test[y_test['is_fatal'] == 1]

In [None]:
# Feature rows of every encoded case that is still unresolved: filter on the
# flag, then remove the target columns held in y. drop() returns a new frame,
# so new_result is left untouched.
new_pred = new_result[new_result['is_unresolved'] == 1].drop(columns=list(y.columns))
new_pred

In [None]:
# Let the fitted decision tree predict outcomes for the unresolved cases.
new_y_pred = dt.predict(new_pred)

In [None]:
# Inspect the predictions made for the unresolved cases.
new_y_pred

### Gradient Boosting 

In [None]:
# Load the single-label targets from disk; the cells below expect each file
# to provide an 'Outcome' column.
y_train_1d, y_test_1d = [
    pd.read_csv(csv_name) for csv_name in ('y_train_1d.csv', 'y_test_1d.csv')
]

In [None]:
# Gradient boosting on the single-label target, flattened to a 1-d array.
# random_state is fixed so the fitted model is reproducible across runs.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train_1d.values.ravel())

In [None]:
# Class counts of the Outcome label, training targets first, then test.
print(y_train_1d['Outcome'].value_counts())
print(y_test_1d['Outcome'].value_counts())

In [None]:
y_pred = gb.predict(X_test)
# scikit-learn metrics take (y_true, y_pred), so the true labels go first.
recall = recall_score(y_test_1d, y_pred, average = 'micro') * 100
precision = precision_score(y_test_1d, y_pred, average = 'micro') * 100
# These scores belong to the gradient boosting model, so label them as such
# (they were previously printed under the name "Decision Tree").
print("Recall of Gradient Boosting {:.2f} %".format(recall))
print("Precision of Gradient Boosting {:.2f} %".format(precision))

### Random Forest Classifier

In [None]:
# Random forest fitted on all outcome columns of y_train.
# random_state is fixed so the fitted forest is reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). Passing them the other way
# round exchanges the roles of false positives and false negatives, so the
# value printed as recall was really precision, and vice versa.
recall = recall_score(y_test, y_pred, average='micro') * 100
precision = precision_score(y_test, y_pred, average='micro') * 100
# These scores belong to the random forest, so label them as such (they
# were previously printed under the name "Decision Tree").
print("Recall of Random Forest {:.2f} %".format(recall))
print("Precision of Random Forest {:.2f} %".format(precision))

In [None]:
# Accuracy of the most recent predictions against the test targets.
accuracy_score(y_true=y_test, y_pred=y_pred)