In [46]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import train_test_split
import os

In [47]:
# Get the current working directory
current_directory = os.getcwd()

# Go back one level to the parent directory
parent_directory = os.path.dirname(current_directory)

# Open a different folder
desired_folder = "processed_data"  
folder_path = os.path.join(parent_directory, desired_folder)

# Check if the folder exists
if os.path.exists(folder_path) and os.path.isdir(folder_path):
    file_to_read = "filtered_dataset_v2.csv" 
    file_path = os.path.join(folder_path, file_to_read)
    
    # Check if the file exists
    if os.path.exists(file_path) and os.path.isfile(file_path):
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error reading file: {e}")
    else:
        print(f"File '{file_to_read}' does not exist in folder '{desired_folder}'.")
else:
    print(f"Folder '{desired_folder}' does not exist in the parent directory.")

In [48]:
df.head()

Unnamed: 0.1,Unnamed: 0,Summary,Temperature (C),Apparent Temperature (C),Humidity,Pressure (millibars)
0,4,Mostly Cloudy,8.755556,6.977778,0.83,1016.51
1,5,Partly Cloudy,9.222222,7.111111,0.85,1016.66
2,12,Partly Cloudy,17.8,17.8,0.55,1017.59
3,14,Partly Cloudy,18.877778,18.877778,0.47,1017.17
4,18,Mostly Cloudy,14.255556,14.255556,0.69,1015.82


In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38924 entries, 0 to 38923
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                38924 non-null  int64  
 1   Summary                   38924 non-null  object 
 2   Temperature (C)           38924 non-null  float64
 3   Apparent Temperature (C)  38924 non-null  float64
 4   Humidity                  38924 non-null  float64
 5   Pressure (millibars)      38924 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 1.8+ MB


In [50]:
df['Summary'].value_counts()

Summary
Mostly Cloudy    9731
Partly Cloudy    9731
Overcast         9731
Clear            9731
Name: count, dtype: int64

In [51]:
df.drop(columns=["Unnamed: 0"])

Unnamed: 0,Summary,Temperature (C),Apparent Temperature (C),Humidity,Pressure (millibars)
0,Mostly Cloudy,8.755556,6.977778,0.83,1016.51
1,Partly Cloudy,9.222222,7.111111,0.85,1016.66
2,Partly Cloudy,17.800000,17.800000,0.55,1017.59
3,Partly Cloudy,18.877778,18.877778,0.47,1017.17
4,Mostly Cloudy,14.255556,14.255556,0.69,1015.82
...,...,...,...,...,...
38919,Clear,16.072222,16.072222,0.88,1015.25
38920,Partly Cloudy,28.866667,28.216667,0.37,1015.35
38921,Partly Cloudy,30.894444,29.450000,0.28,1014.66
38922,Partly Cloudy,30.766667,29.311111,0.28,1013.83


In [52]:
# feature_cols = ["Temperature (C)","Apparent Temperature (C)", "Humidity", "Pressure (millibars)"]
feature_cols = ["Temperature (C)", "Humidity", "Pressure (millibars)"]

X = df[feature_cols]
Y = df.Summary

In [53]:
# split X and y into training and testing sets

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

In [54]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

X_train_scaled = X_train
X_test_scaled = X_test

## Using Logistic Regression

In [55]:
lr = LogisticRegression(random_state=16,solver='saga',max_iter=1000).fit(X_train_scaled, Y_train)



In [56]:
Y_pred = lr.predict(X_test_scaled)

In [57]:
from sklearn import metrics

cnf_matrix_lr = metrics.confusion_matrix(Y_test, Y_pred)
cnf_matrix_lr

array([[ 127,  344,  995,  991],
       [ 122,  312,  900, 1048],
       [ 199,  275, 1635,  389],
       [  94,  290,  651, 1359]], dtype=int64)

In [58]:
lr.score(X_test_scaled,Y_test)

0.35279005240982425

In [59]:
from sklearn.metrics import classification_report
# target_names = ['without diabetes', 'with diabetes']
print(classification_report(Y_test, Y_pred))

               precision    recall  f1-score   support

        Clear       0.23      0.05      0.08      2457
Mostly Cloudy       0.26      0.13      0.17      2382
     Overcast       0.39      0.65      0.49      2498
Partly Cloudy       0.36      0.57      0.44      2394

     accuracy                           0.35      9731
    macro avg       0.31      0.35      0.30      9731
 weighted avg       0.31      0.35      0.30      9731



## Using SVC

One vs One Approach

In [60]:
svc = svm.SVC(decision_function_shape="ovo")
svc.fit(X_train_scaled, Y_train)

In [61]:
Y_pred_svc = svc.predict(X_test_scaled)

cnf_matrix_svc = metrics.confusion_matrix(Y_test, Y_pred_svc)
cnf_matrix_svc

array([[ 843,  162,  503,  949],
       [ 442,  298,  728,  914],
       [ 452,  306, 1428,  312],
       [ 441,  217,  494, 1242]], dtype=int64)

In [62]:
svc.score(X_test_scaled,Y_test)

0.3916349809885932

In [63]:
print(classification_report(Y_test, Y_pred_svc))

               precision    recall  f1-score   support

        Clear       0.39      0.34      0.36      2457
Mostly Cloudy       0.30      0.13      0.18      2382
     Overcast       0.45      0.57      0.51      2498
Partly Cloudy       0.36      0.52      0.43      2394

     accuracy                           0.39      9731
    macro avg       0.38      0.39      0.37      9731
 weighted avg       0.38      0.39      0.37      9731



One vs Rest Approach

In [64]:
lin_svc = svm.LinearSVC(dual="auto").fit(X_train_scaled,Y_train)

In [65]:
Y_pred_lin_svc = svc.predict(X_test_scaled)

cnf_matrix_lin_sv = metrics.confusion_matrix(Y_test, Y_pred_lin_svc)
cnf_matrix_lin_sv

array([[ 843,  162,  503,  949],
       [ 442,  298,  728,  914],
       [ 452,  306, 1428,  312],
       [ 441,  217,  494, 1242]], dtype=int64)

In [66]:
lin_svc.score(X_test_scaled, Y_test)

0.3694378789435824

In [67]:
print(classification_report(Y_test, Y_pred_lin_svc))

               precision    recall  f1-score   support

        Clear       0.39      0.34      0.36      2457
Mostly Cloudy       0.30      0.13      0.18      2382
     Overcast       0.45      0.57      0.51      2498
Partly Cloudy       0.36      0.52      0.43      2394

     accuracy                           0.39      9731
    macro avg       0.38      0.39      0.37      9731
 weighted avg       0.38      0.39      0.37      9731



## Using KNN

In [68]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=4).fit(X_train_scaled,Y_train)

In [69]:
Y_pred_knn = knn.predict(X_test_scaled)

cnf_matrix_knn = metrics.confusion_matrix(Y_test, Y_pred_knn)
cnf_matrix_knn

array([[1119,  467,  417,  454],
       [ 624,  775,  543,  440],
       [ 503,  585, 1154,  256],
       [ 756,  587,  395,  656]], dtype=int64)

In [70]:
knn.score(X_test_scaled, Y_test)

0.3806391943274073

In [71]:
print(classification_report(Y_test, Y_pred_knn))

               precision    recall  f1-score   support

        Clear       0.37      0.46      0.41      2457
Mostly Cloudy       0.32      0.33      0.32      2382
     Overcast       0.46      0.46      0.46      2498
Partly Cloudy       0.36      0.27      0.31      2394

     accuracy                           0.38      9731
    macro avg       0.38      0.38      0.38      9731
 weighted avg       0.38      0.38      0.38      9731



## Using Gaussian Naive Bayes

In [72]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB().fit(X_train_scaled, Y_train)

In [73]:
Y_pred_gnb = gnb.predict(X_test_scaled)

cnf_matrix_gnb = metrics.confusion_matrix(Y_test, Y_pred_gnb)
cnf_matrix_knn

array([[1119,  467,  417,  454],
       [ 624,  775,  543,  440],
       [ 503,  585, 1154,  256],
       [ 756,  587,  395,  656]], dtype=int64)

In [74]:
gnb.score(X_test_scaled, Y_test)

0.39317644640838556

In [75]:
print(classification_report(Y_test, Y_pred_gnb))

               precision    recall  f1-score   support

        Clear       0.40      0.18      0.25      2457
Mostly Cloudy       0.30      0.16      0.21      2382
     Overcast       0.40      0.78      0.53      2498
Partly Cloudy       0.42      0.43      0.43      2394

     accuracy                           0.39      9731
    macro avg       0.38      0.39      0.36      9731
 weighted avg       0.38      0.39      0.36      9731



## Using Decision Tree

In [76]:
from sklearn.tree import DecisionTreeClassifier

In [77]:
dt = DecisionTreeClassifier(random_state=0).fit(X_train_scaled, Y_train)

In [78]:
Y_pred_dt = dt.predict(X_test_scaled)

cnf_matrix_dt = metrics.confusion_matrix(Y_test, Y_pred_dt)
cnf_matrix_dt

array([[ 880,  542,  413,  622],
       [ 472,  714,  598,  598],
       [ 432,  575, 1084,  407],
       [ 587,  591,  392,  824]], dtype=int64)

In [79]:
dt.score(X_test_scaled, Y_test)

0.35988079334086936

In [80]:
print(classification_report(Y_test, Y_pred_dt))

               precision    recall  f1-score   support

        Clear       0.37      0.36      0.36      2457
Mostly Cloudy       0.29      0.30      0.30      2382
     Overcast       0.44      0.43      0.43      2498
Partly Cloudy       0.34      0.34      0.34      2394

     accuracy                           0.36      9731
    macro avg       0.36      0.36      0.36      9731
 weighted avg       0.36      0.36      0.36      9731



## Using Random Forest

In [81]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=0).fit(X_train_scaled, Y_train)

In [82]:
Y_pred_rf = rf.predict(X_test_scaled)

cnf_matrix_rf = metrics.confusion_matrix(Y_test, Y_pred_rf)
cnf_matrix_rf

array([[1040,  405,  403,  609],
       [ 444,  726,  626,  586],
       [ 356,  454, 1404,  284],
       [ 592,  514,  385,  903]], dtype=int64)

In [83]:
rf.score(X_test_scaled, Y_test)

0.4185592436543007

In [84]:
print(classification_report(Y_test, Y_pred_rf))

               precision    recall  f1-score   support

        Clear       0.43      0.42      0.43      2457
Mostly Cloudy       0.35      0.30      0.32      2382
     Overcast       0.50      0.56      0.53      2498
Partly Cloudy       0.38      0.38      0.38      2394

     accuracy                           0.42      9731
    macro avg       0.41      0.42      0.41      9731
 weighted avg       0.41      0.42      0.42      9731

