# Data Cleaning and Visualisation

EDA process includes:

- Removing Duplicates
- Fixing Values
- Normalizing if needed
- Cleaning Outliers
- Looking at feature distributions
- Separating into categorical and numerical columns
- Visualizing Data

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import joblib
import pickle

In [12]:
# Read the weather classification CSV (path is relative to the working
# directory) into `df`, then preview the first five rows.
file_path = 'dataset/weather_classification_data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,Sunny
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,Sunny
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,Rainy


In [13]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(13200, 11)

In [14]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13200 entries, 0 to 13199
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Temperature           13200 non-null  float64
 1   Humidity              13200 non-null  int64  
 2   Wind Speed            13200 non-null  float64
 3   Precipitation (%)     13200 non-null  float64
 4   Cloud Cover           13200 non-null  object 
 5   Atmospheric Pressure  13200 non-null  float64
 6   UV Index              13200 non-null  int64  
 7   Season                13200 non-null  object 
 8   Visibility (km)       13200 non-null  float64
 9   Location              13200 non-null  object 
 10  Weather Type          13200 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 1.1+ MB


In [15]:
# Checking for Null Values
# `null_check` is the total number of missing cells across the whole frame.
null_check = df.isnull().sum().sum()
if null_check == 0:
    print("No Null Values Found")
else:
    # dropna() returns a new DataFrame and leaves the original untouched, so
    # the result must be assigned back to `df` for the rows to be removed.
    df = df.dropna()
    print("Null Values Found and Dropped")

No Null Values Found


In [16]:
# Checking for duplicates
# duplicated() flags every row that repeats an earlier row exactly.
if df.duplicated().any():
    # drop_duplicates() returns a new DataFrame and leaves the original
    # untouched, so the result must be assigned back to `df`.
    df = df.drop_duplicates()
    print("Duplicates Dropped")
else:
    print("No Duplicates Found")

No Duplicates Found


In [17]:
# Check if all values are numeric
# Convert column by column instead of using the deprecated, element-wise
# DataFrame.applymap. Values that cannot be parsed as numbers become NaN
# (errors='coerce'), so the frame is fully numeric only if no NaN remains.
# Note: cells that are already missing also count as non-numeric here.
coerced = df.apply(lambda column: pd.to_numeric(column, errors='coerce'))
is_numeric = coerced.notnull().all().all()
if is_numeric:
    print("All values in the dataset are numeric.")
else:
    print("There are non-numeric values in the dataset.")

There are non-numeric values in the dataset.


  is_numeric = df.applymap(lambda x: pd.to_numeric(x, errors='coerce')).notnull().all().all()


In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Atmospheric Pressure,UV Index,Visibility (km)
count,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0
mean,19.127576,68.710833,9.832197,53.644394,1005.827896,4.005758,5.462917
std,17.386327,20.194248,6.908704,31.946541,37.199589,3.8566,3.371499
min,-25.0,20.0,0.0,0.0,800.12,0.0,0.0
25%,4.0,57.0,5.0,19.0,994.8,1.0,3.0
50%,21.0,70.0,9.0,58.0,1007.65,3.0,5.0
75%,31.0,84.0,13.5,82.0,1016.7725,7.0,7.5
max,109.0,109.0,48.5,109.0,1199.21,14.0,20.0


In [19]:
# Every column whose dtype is not numeric is treated as a categorical feature.
categorical_features = df.select_dtypes(exclude=[np.number]).columns
print(f"Categorical Columns: {categorical_features}")

Categorical Columns: Index(['Cloud Cover', 'Season', 'Location', 'Weather Type'], dtype='object')
