# Machine Learning Assignment

**Dataset**:       WEATHER CLASSIFICATION


## Imports

Add imports here as needed.

Remember to **re-run the cell when you add imports**, so it gets loaded into the virtual notebook environment!

In [1]:
# Data and Datasets
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler

# Clustering
from sklearn.cluster import DBSCAN

# Validation methods
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

# Metrics
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier

# Hyper-parameter optimisation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Feature selection & feature engineering
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA

# Stats
from scipy.stats import randint as sp_randint
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import shapiro     # Shapiro Wilk
from scipy.stats import normaltest  # D’Agostino’s K^2
from scipy.stats import anderson    # Anderson-Darling
from scipy.stats import ttest_ind    # independent student t-test; assumes normality
from scipy.stats import mannwhitneyu # non-parametric; doesn't assume normality

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
from sklearn.tree import export_graphviz

# Utils
import pprint
import numpy as np
from time import time
import openpyxl

## Loading the dataset

In [2]:
# Read the raw weather data (pandas is already imported in the imports cell above)
df = pd.read_csv('data/weather_dataset.csv')

# Column names, non-null counts and dtypes.
# DataFrame.info() prints its report itself and returns None, so it must not be wrapped
# in print() -- doing so emits a stray "None" after the report.
df.info()

# First rows, rendered as a table because it is the last expression of the cell
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13200 entries, 0 to 13199
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   temperature           10392 non-null  float64
 1   humidity              12977 non-null  float64
 2   wind_speed            12967 non-null  float64
 3   precipitation         9099 non-null   float64
 4   cloud_cover           12970 non-null  object 
 5   atmospheric_pressure  12961 non-null  float64
 6   season                12962 non-null  object 
 7   visibility            12968 non-null  float64
 8   location              12968 non-null  object 
 9   weather               13200 non-null  object 
dtypes: float64(6), object(4)
memory usage: 1.0+ MB
None


Unnamed: 0,temperature,humidity,wind_speed,precipitation,cloud_cover,atmospheric_pressure,season,visibility,location,weather
0,14.0,73.0,9.5,82.0,partly cloudy,1010.82,Winter,3.5,inland,Rainy
1,39.0,96.0,8.5,71.0,partly cloudy,1011.43,Spring,10.0,inland,Cloudy
2,,0.0,7.0,,clear,1018.72,Spring,5.5,mountain,Sunny
3,38.0,83.0,1.5,82.0,clear,1026.25,Spring,1.0,coastal,Sunny
4,,74.0,17.0,,overcast,990.67,Winter,2.5,mountain,Rainy


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,temperature,humidity,wind_speed,precipitation,atmospheric_pressure,visibility
count,10392.0,12977.0,12967.0,9099.0,12961.0,12968.0
mean,19.148768,65.258457,9.829837,53.639081,1005.882164,5.468924
std,17.367761,24.753316,6.906346,31.961687,37.190289,3.371322
min,-25.0,0.0,0.0,0.0,800.12,0.0
25%,4.0,54.0,5.0,19.0,994.81,3.0
50%,21.0,69.0,9.0,58.0,1007.69,5.0
75%,31.0,83.0,13.5,82.0,1016.79,7.5
max,108.0,109.0,48.5,109.0,1199.21,20.0


## Exploratory data analysis

Section for exploratory data analysis, to address **Tasks 1.1 and 1.2**.

**OBS:** You may need to do some data cleaning before you do your full exploratory data analysis, though you will find that some functions we'll cover in this unit are able to handle things like missing values and non-numeric data.

**Create more cells as needed!**

## Task 2.1 - ML Workflow to Critically Evaluate

In [4]:
# Listwise deletion: keep only the rows that have a value in every column
df = df.dropna()

# Integer-encode every text (object) column in a single pass:
# cast the column to the 'category' dtype and keep only its integer category codes
cat_columns = df.select_dtypes(['object']).columns
df[cat_columns] = df[cat_columns].apply(lambda column: column.astype('category').cat.codes)

In [5]:
# Feature matrix X = every column except the last; label vector y = the last column
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Hold-out validation: 20% of the rows are set aside for testing, with a fixed
# shuffle seed and no stratification on the label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=None
)

# Fit a random forest classifier (default hyper-parameters) on the training split
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predict the labels of the held-out test split
y_pred = model.predict(X_test)

# Report the confusion matrix, then the test accuracy
test_confusion = metrics.confusion_matrix(y_test, y_pred)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_confusion)
print("\nAccuracy (Testing):  %0.2f " % (test_accuracy))

[[311  17  14   9]
 [ 17 337   7   6]
 [ 12   7 321   9]
 [ 19  10  13 351]]

Accuracy (Testing):  0.90 


## Task 2.3 - Evaluation of Improved ML Workflow

Add code for running your **improved** machine learning experiments below.


In [None]:
# Loading the dataset again, for you to do your own pre-processing (instead of what was done above)
df = pd.read_csv('data/weather_dataset.csv')

MAX_MISSING_PERCENT = 50  # columns with a larger share of missing values are dropped
IMPUTATION_SEED = 1       # fixed seed so that the imputed values are reproducible


def prune_and_impute(frame, max_missing_percent=MAX_MISSING_PERCENT, seed=IMPUTATION_SEED):
    """Drop uninformative columns and fill missing numeric values.

    For each column of ``frame``:

    * it is dropped when more than ``max_missing_percent`` percent of its values are missing;
    * a numeric column is dropped when it holds at most one distinct non-missing value;
    * otherwise the missing entries of a numeric column are replaced by values drawn
      uniformly from ``[mean - std, mean + std)`` of that column.

    Non-numeric columns that survive the missing-value check are left untouched,
    so any missing values they contain remain missing.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; it is not modified.
    max_missing_percent : float
        Threshold (in percent) above which a column is dropped.
    seed : int
        Seed of the random generator used for the imputed values.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``frame``.
    """
    rng = np.random.default_rng(seed)
    cleaned = frame.copy()

    # Iterate over the untouched input so that edits to `cleaned` cannot disturb the loop
    for series_name, series in frame.items():
        missing_mask = series.isna()
        percent_missing = missing_mask.mean() * 100

        if percent_missing > max_missing_percent:
            cleaned = cleaned.drop(columns=[series_name])
            # Stop here: falling through to the imputation step would re-create
            # the column that was just dropped
            continue

        if not pd.api.types.is_numeric_dtype(series):
            continue

        # A column with a single distinct value carries no information
        if series.nunique(dropna=True) <= 1:
            cleaned = cleaned.drop(columns=[series_name])
            continue

        n_missing = int(missing_mask.sum())
        if n_missing == 0:
            continue

        nominal = series.mean()
        std_dev = series.std()
        # One vectorised draw per column; float bounds are used as-is instead of
        # being truncated to integers
        cleaned.loc[missing_mask, series_name] = rng.uniform(
            nominal - std_dev, nominal + std_dev, size=n_missing
        )

    return cleaned


df = prune_and_impute(df)
df.head()

In [None]:
# Hold-out test metrics for the predictions currently stored in y_pred.
# NOTE(review): y_test / y_pred are whatever the most recently run training cell produced --
# re-fit and re-predict on the re-processed data before reading these numbers.
#
# The confusion matrix printed earlier is 4x4, i.e. the target is multiclass. precision_score and
# recall_score default to average='binary', which raises a ValueError on a multiclass target,
# so an averaging mode must be given. 'macro' is the unweighted mean of the per-class scores.
print("\nAccuracy (Testing):  %0.2f " % (metrics.accuracy_score(y_test, y_pred)))
print("\nPrecision (Testing): %0.2f " % (metrics.precision_score(y_test, y_pred, average='macro')))
print("\nRecall (Testing):    %0.2f " % (metrics.recall_score(y_test, y_pred, average='macro')))