In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Path to the downloaded CSV file
data_path = r"C:\Users\ENG WAHEED\Downloads\Confirmed Positive Cases of COVID-19 in Ontario.csv"

# Load the dataset into a DataFrame
df = pd.read_csv(data_path)

# Display the first few rows of the DataFrame
print(df.head())

   Row_ID Accurate_Episode_Date Case_Reported_Date Test_Reported_Date  \
0       1            1934-09-28         2022-09-29         2022-09-29   
1       2            1989-02-21         2022-11-08         2022-11-07   
2       3            2000-03-01         2022-01-30                NaN   
3       4            2002-07-06         2022-07-06         2022-07-07   
4       5            2002-08-08         2022-08-15         2022-08-15   

  Specimen_Date Age_Group Client_Gender Outcome1  Reporting_PHU_ID  \
0    2022-09-27       <20        FEMALE      NaN              2262   
1    2022-11-06       <20        FEMALE      NaN              2270   
2    2000-03-01       <20        FEMALE      NaN              2243   
3    2002-07-06       20s        FEMALE      NaN              2270   
4    2022-08-14       60s          MALE      NaN              2233   

                                      Reporting_PHU  Reporting_PHU_Address  \
0                  Thunder Bay District Health Unit    999 Bal

In [4]:
# Display basic information about the dataset
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1717434 entries, 0 to 1717433
Data columns (total 16 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   Row_ID                     int64  
 1   Accurate_Episode_Date      object 
 2   Case_Reported_Date         object 
 3   Test_Reported_Date         object 
 4   Specimen_Date              object 
 5   Age_Group                  object 
 6   Client_Gender              object 
 7   Outcome1                   object 
 8   Reporting_PHU_ID           int64  
 9   Reporting_PHU              object 
 10  Reporting_PHU_Address      object 
 11  Reporting_PHU_City         object 
 12  Reporting_PHU_Postal_Code  object 
 13  Reporting_PHU_Website      object 
 14  Reporting_PHU_Latitude     float64
 15  Reporting_PHU_Longitude    float64
dtypes: float64(2), int64(2), object(12)
memory usage: 209.6+ MB
None


In [5]:
# Check for missing values
print(df.isnull().sum())

Row_ID                             0
Accurate_Episode_Date              0
Case_Reported_Date                 0
Test_Reported_Date             53492
Specimen_Date                  12133
Age_Group                          0
Client_Gender                      0
Outcome1                     1698807
Reporting_PHU_ID                   0
Reporting_PHU                      0
Reporting_PHU_Address              0
Reporting_PHU_City                 0
Reporting_PHU_Postal_Code          0
Reporting_PHU_Website              0
Reporting_PHU_Latitude             0
Reporting_PHU_Longitude            0
dtype: int64


In [6]:
# Handle missing values drop the rows
df.dropna(subset=['Test_Reported_Date', 'Specimen_Date'], inplace=True)

# Fill missing Outcome1 with 'Nonfatal'
df['Outcome1'].fillna('Nonfatal', inplace=True)

# Remove duplicate rows
df.drop_duplicates(inplace=True)

# reCheck the missing values
print(df.isnull().sum())

Row_ID                       0
Accurate_Episode_Date        0
Case_Reported_Date           0
Test_Reported_Date           0
Specimen_Date                0
Age_Group                    0
Client_Gender                0
Outcome1                     0
Reporting_PHU_ID             0
Reporting_PHU                0
Reporting_PHU_Address        0
Reporting_PHU_City           0
Reporting_PHU_Postal_Code    0
Reporting_PHU_Website        0
Reporting_PHU_Latitude       0
Reporting_PHU_Longitude      0
dtype: int64


In [7]:
# Convert datetime columns to datetime format
df['Accurate_Episode_Date'] = pd.to_datetime(df['Accurate_Episode_Date'], errors='coerce')
df['Case_Reported_Date'] = pd.to_datetime(df['Case_Reported_Date'], errors='coerce')
df['Test_Reported_Date'] = pd.to_datetime(df['Test_Reported_Date'], errors='coerce')
df['Specimen_Date'] = pd.to_datetime(df['Specimen_Date'], errors='coerce')


In [8]:
# Extract year, month, and day from datetime columns
df['Accurate_Episode_Date_year'] = df['Accurate_Episode_Date'].dt.year
df['Accurate_Episode_Date_month'] = df['Accurate_Episode_Date'].dt.month
df['Accurate_Episode_Date_day'] = df['Accurate_Episode_Date'].dt.day

df['Case_Reported_Date_year'] = df['Case_Reported_Date'].dt.year
df['Case_Reported_Date_month'] = df['Case_Reported_Date'].dt.month
df['Case_Reported_Date_day'] = df['Case_Reported_Date'].dt.day

df['Test_Reported_Date_year'] = df['Test_Reported_Date'].dt.year
df['Test_Reported_Date_month'] = df['Test_Reported_Date'].dt.month
df['Test_Reported_Date_day'] = df['Test_Reported_Date'].dt.day

df['Specimen_Date_year'] = df['Specimen_Date'].dt.year
df['Specimen_Date_month'] = df['Specimen_Date'].dt.month
df['Specimen_Date_day'] = df['Specimen_Date'].dt.day

# Drop the original datetime columns
df.drop(columns=['Accurate_Episode_Date', 'Case_Reported_Date', 'Test_Reported_Date', 'Specimen_Date'], inplace=True)


In [9]:
# Label Encoding for 'Outcome1'
df['Outcome1_Encoded'] = LabelEncoder().fit_transform(df['Outcome1'])

# One-Hot Encoding for 'Age_Group' and 'Client_Gender'
df = pd.get_dummies(df, columns=['Age_Group', 'Client_Gender'])

In [10]:
# Print the columns in the dataframe
print("Columns in the dataframe:", df.columns)

Columns in the dataframe: Index(['Row_ID', 'Outcome1', 'Reporting_PHU_ID', 'Reporting_PHU',
       'Reporting_PHU_Address', 'Reporting_PHU_City',
       'Reporting_PHU_Postal_Code', 'Reporting_PHU_Website',
       'Reporting_PHU_Latitude', 'Reporting_PHU_Longitude',
       'Accurate_Episode_Date_year', 'Accurate_Episode_Date_month',
       'Accurate_Episode_Date_day', 'Case_Reported_Date_year',
       'Case_Reported_Date_month', 'Case_Reported_Date_day',
       'Test_Reported_Date_year', 'Test_Reported_Date_month',
       'Test_Reported_Date_day', 'Specimen_Date_year', 'Specimen_Date_month',
       'Specimen_Date_day', 'Outcome1_Encoded', 'Age_Group_20s',
       'Age_Group_30s', 'Age_Group_40s', 'Age_Group_50s', 'Age_Group_60s',
       'Age_Group_70s', 'Age_Group_80s', 'Age_Group_90+', 'Age_Group_<20',
       'Age_Group_UNKNOWN', 'Client_Gender_FEMALE',
       'Client_Gender_GENDER DIVERSE', 'Client_Gender_MALE',
       'Client_Gender_UNSPECIFIED'],
      dtype='object')


In [11]:
# Define columns to drop if they exist in the dataframe
columns_to_drop = ['Row_ID', 'Outcome1', 'Reporting_PHU', 'Reporting_PHU_Address', 'Reporting_PHU_City', 'Reporting_PHU_Postal_Code', 'Reporting_PHU_Website']
df.drop(columns=[col for col in columns_to_drop if col in df.columns], inplace=True)


In [12]:
# Select features and target variable
features = df.drop(columns=['Outcome1_Encoded'])
target = df['Outcome1_Encoded']

# 1- LogisticRegression

In [17]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Initialize the model
model = LogisticRegression(max_iter=1000)
# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)  # Corrected this line

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)  # Corrected this line
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.9891572203055693
Confusion Matrix:
[[     0   3608]
 [     0 329148]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      3608
           1       0.99      1.00      0.99    329148

    accuracy                           0.99    332756
   macro avg       0.49      0.50      0.50    332756
weighted avg       0.98      0.99      0.98    332756



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# Increase the Number of Iterations ,Scale the Data and Balance the Classes

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the model with class weights to handle imbalance
model = LogisticRegression(max_iter=2000, class_weight='balanced')
# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, zero_division=1)

print(f'Accuracy: {accuracy}')
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.7985701234538221
Confusion Matrix:
[[  3218    390]
 [ 66637 262511]]
Classification Report:
              precision    recall  f1-score   support

           0       0.05      0.89      0.09      3608
           1       1.00      0.80      0.89    329148

    accuracy                           0.80    332756
   macro avg       0.52      0.84      0.49    332756
weighted avg       0.99      0.80      0.88    332756



# 2- Decision Tree

In [21]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Initialize the Decision Tree Classifier
decision_tree = DecisionTreeClassifier(random_state=42)

# Train the model
decision_tree.fit(X_train, y_train)

# Make predictions on the test set
y_pred = decision_tree.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)


Accuracy: 0.9789214497939427
Confusion Matrix:
[[   707   4634]
 [  5887 487905]]
Classification Report:
              precision    recall  f1-score   support

           0       0.11      0.13      0.12      5341
           1       0.99      0.99      0.99    493792

    accuracy                           0.98    499133
   macro avg       0.55      0.56      0.55    499133
weighted avg       0.98      0.98      0.98    499133



# 3- Random Forest

In [24]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42, stratify=target)

# Initialize the Random Forest model
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
random_forest.fit(X_train, y_train)

# Make predictions
y_pred = random_forest.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)


Accuracy: 0.9872839503699414
Confusion Matrix:
[[   380   4962]
 [  1385 492406]]
Classification Report:
              precision    recall  f1-score   support

           0       0.22      0.07      0.11      5342
           1       0.99      1.00      0.99    493791

    accuracy                           0.99    499133
   macro avg       0.60      0.53      0.55    499133
weighted avg       0.98      0.99      0.98    499133

