## 1. Import necessary libraries

In [1]:
#importing library
import pandas as pd

## 2. Import data

In [2]:
# Load the year-wise weather report from the cleaned CSV file.
# The path is relative, so it is resolved against the current working directory.
weather_report = pd.read_csv(filepath_or_buffer='data_clean.csv')
# Preview up to the first 50 rows of the loaded frame.
weather_report.head(n=50)

Unnamed: 0.1,Unnamed: 0,Ozone,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
0,1,41.0,190.0,7.4,67,5,1,2010,67,S
1,2,36.0,118.0,8.0,72,5,2,2010,72,C
2,3,12.0,149.0,12.6,74,5,3,2010,74,PS
3,4,18.0,313.0,11.5,62,5,4,2010,62,S
4,5,,,14.3,56,5,5,2010,56,S
5,6,28.0,,14.9,66,5,6,2010,66,C
6,7,23.0,299.0,8.6,65,5,7,2010,65,PS
7,8,19.0,99.0,13.8,59,5,8,2010,59,C
8,9,8.0,19.0,20.1,61,5,9,2010,61,PS
9,10,,194.0,8.6,69,5,10,2010,69,S


## 3. Initial Investigation/Data Understanding

In [3]:
# Inspect the dtype pandas inferred for each column.
# Columns reported as `object` hold non-numeric (or mixed) values and may need conversion.
weather_report.dtypes

Unnamed: 0      int64
Ozone         float64
Solar.R       float64
Wind          float64
Temp C         object
Month          object
Day             int64
Year            int64
Temp            int64
Weather        object
dtype: object

In [4]:
# Count the missing (NaN) entries in every column.
weather_report.isna().sum()

Unnamed: 0     0
Ozone         38
Solar.R        7
Wind           0
Temp C         0
Month          0
Day            0
Year           0
Temp           0
Weather        3
dtype: int64

In [5]:
# Size of the dataset as a (rows, columns) tuple.
weather_report.shape

(158, 10)

## 4. Data Preparation/Preprocessing

* Data Cleaning - dropna, mean, median, mode Imputation, replace
* Data Transformation
        -  If data is continuous - Standardization/Normalization, MinMaxScaler
        -  If data is discrete  - Label Encoding, One Hot Encoding
* Datatype Conversion - i.e., from Object to int or float or datetime

In [6]:
# Drop the 'Unnamed: 0' column and the 'Ozone' column.
# Ozone has 38 missing values in 158 rows (see the null counts and shape above),
# presumably why it is removed rather than imputed.
#
# The result is reassigned instead of using inplace=True, and errors='ignore' is
# passed, so this cell can be re-run safely: on a second run the columns are
# already gone and the call is a no-op instead of raising KeyError.
weather_report = weather_report.drop(columns=['Unnamed: 0', 'Ozone'], errors='ignore')
weather_report

Unnamed: 0,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
0,190.0,7.4,67,5,1,2010,67,S
1,118.0,8.0,72,5,2,2010,72,C
2,149.0,12.6,74,5,3,2010,74,PS
3,313.0,11.5,62,5,4,2010,62,S
4,,14.3,56,5,5,2010,56,S
...,...,...,...,...,...,...,...,...
153,190.0,7.4,67,5,1,2010,67,C
154,193.0,6.9,70,9,26,2010,70,PS
155,145.0,13.2,77,9,27,2010,77,S
156,191.0,14.3,75,9,28,2010,75,S


In [7]:
# Re-count missing values per column for the columns that remain.
weather_report.isna().sum()

Solar.R    7
Wind       0
Temp C     0
Month      0
Day        0
Year       0
Temp       0
Weather    3
dtype: int64

In [8]:
# Summary statistics for every column. include='all' adds count/unique/top/freq
# for object columns alongside the numeric statistics.
weather_report.describe(include = 'all')

Unnamed: 0,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
count,151.0,158.0,158.0,158.0,158.0,158.0,158.0,155
unique,,,41.0,6.0,,,,3
top,,,81.0,9.0,,,,S
freq,,,11.0,34.0,,,,59
mean,185.403974,9.957595,,,16.006329,2010.0,77.727848,
std,88.723103,3.511261,,,8.997166,0.0,9.377877,
min,7.0,1.7,,,1.0,2010.0,56.0,
25%,119.0,7.4,,,8.0,2010.0,72.0,
50%,197.0,9.7,,,16.0,2010.0,78.5,
75%,257.0,11.875,,,24.0,2010.0,84.0,


In [9]:
# Median of Solar.R (NaN entries are skipped); this is the value used to
# impute the missing Solar.R readings.
weather_report.loc[:, 'Solar.R'].median()

197.0

In [10]:
# Impute the missing Solar.R readings with the column median.
solar_median = weather_report['Solar.R'].median()
weather_report['Solar.R'] = weather_report['Solar.R'].fillna(solar_median)

In [11]:
# Confirm Solar.R no longer has missing values.
weather_report.isna().sum()

Solar.R    0
Wind       0
Temp C     0
Month      0
Day        0
Year       0
Temp       0
Weather    3
dtype: int64

In [12]:
# Check the column dtypes before converting the object-typed columns to numbers.
weather_report.dtypes

Solar.R    float64
Wind       float64
Temp C      object
Month       object
Day          int64
Year         int64
Temp         int64
Weather     object
dtype: object

In [13]:
# Convert the object-typed 'Temp C' and 'Month' columns to numeric dtypes.
# errors='coerce' turns any entry that cannot be parsed as a number into NaN
# instead of raising, so bad entries show up as missing values afterwards.
for column in ('Temp C', 'Month'):
    weather_report[column] = pd.to_numeric(weather_report[column], errors='coerce')

In [14]:
# Preview up to the first 50 rows after the numeric conversion.
weather_report.head(n=50)

Unnamed: 0,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
0,190.0,7.4,67.0,5.0,1,2010,67,S
1,118.0,8.0,72.0,5.0,2,2010,72,C
2,149.0,12.6,74.0,5.0,3,2010,74,PS
3,313.0,11.5,62.0,5.0,4,2010,62,S
4,197.0,14.3,56.0,5.0,5,2010,56,S
5,197.0,14.9,66.0,5.0,6,2010,66,C
6,299.0,8.6,65.0,5.0,7,2010,65,PS
7,99.0,13.8,59.0,5.0,8,2010,59,C
8,19.0,20.1,61.0,5.0,9,2010,61,PS
9,194.0,8.6,69.0,5.0,10,2010,69,S


In [15]:
# Summary statistics of the numeric columns (by default describe() leaves out
# object columns when numeric columns are present).
weather_report.describe()

Unnamed: 0,Solar.R,Wind,Temp C,Month,Day,Year,Temp
count,158.0,158.0,157.0,157.0,158.0,158.0,158.0
mean,185.917722,9.957595,77.751592,7.044586,16.006329,2010.0,77.727848
std,86.755676,3.511261,9.40312,1.433768,8.997166,0.0,9.377877
min,7.0,1.7,56.0,5.0,1.0,2010.0,56.0
25%,127.0,7.4,72.0,6.0,8.0,2010.0,72.0
50%,197.0,9.7,79.0,7.0,16.0,2010.0,78.5
75%,255.0,11.875,84.0,8.0,24.0,2010.0,84.0
max,334.0,20.7,97.0,9.0,31.0,2010.0,97.0


In [16]:
# Mean of 'Temp C' (NaN entries are skipped); this is the value used to
# impute the missing 'Temp C' entry.
weather_report.loc[:, 'Temp C'].mean()

77.7515923566879

In [17]:
# Impute the missing 'Temp C' values with the column mean.
# NOTE(review): in the displayed rows 'Temp C' matches 'Temp' exactly, so the
# 'Temp' column may be a better source for the missing value - confirm.
temp_c_mean = weather_report['Temp C'].mean()
weather_report['Temp C'] = weather_report['Temp C'].fillna(temp_c_mean)

In [18]:
# Fill the missing 'Month' entries with the hard-coded month number 5.
# NOTE(review): 5 is a fixed value, not derived from the data - presumably
# chosen by inspecting the affected row; confirm.
weather_report['Month'] = weather_report['Month'].fillna(5)

In [19]:
# Preview up to the first 50 rows after imputing 'Temp C' and 'Month'.
weather_report.head(n=50)

Unnamed: 0,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
0,190.0,7.4,67.0,5.0,1,2010,67,S
1,118.0,8.0,72.0,5.0,2,2010,72,C
2,149.0,12.6,74.0,5.0,3,2010,74,PS
3,313.0,11.5,62.0,5.0,4,2010,62,S
4,197.0,14.3,56.0,5.0,5,2010,56,S
5,197.0,14.9,66.0,5.0,6,2010,66,C
6,299.0,8.6,65.0,5.0,7,2010,65,PS
7,99.0,13.8,59.0,5.0,8,2010,59,C
8,19.0,20.1,61.0,5.0,9,2010,61,PS
9,194.0,8.6,69.0,5.0,10,2010,69,S


In [20]:
# Check the column dtypes after the numeric conversion and imputation.
weather_report.dtypes

Solar.R    float64
Wind       float64
Temp C     float64
Month      float64
Day          int64
Year         int64
Temp         int64
Weather     object
dtype: object

In [21]:
# Count the remaining missing values per column.
weather_report.isna().sum()

Solar.R    0
Wind       0
Temp C     0
Month      0
Day        0
Year       0
Temp       0
Weather    3
dtype: int64

In [22]:
# Cast 'Month' from float to integer. The cast requires the column to be free
# of NaN, which is why the missing months are filled first.
# 'int' maps to the platform default integer width.
weather_report['Month'] = weather_report['Month'].astype('int')

In [23]:
# Check the column dtypes again to confirm 'Month' is now an integer column.
weather_report.dtypes

Solar.R    float64
Wind       float64
Temp C     float64
Month        int32
Day          int64
Year         int64
Temp         int64
Weather     object
dtype: object

In [24]:
# Summary statistics for every column; include='all' also covers the
# object-typed 'Weather' column (count/unique/top/freq).
weather_report.describe(include = 'all')

Unnamed: 0,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0,155
unique,,,,,,,,3
top,,,,,,,,S
freq,,,,,,,,59
mean,185.917722,9.957595,77.751592,7.031646,16.006329,2010.0,77.727848,
std,86.755676,3.511261,9.373126,1.438421,8.997166,0.0,9.377877,
min,7.0,1.7,56.0,5.0,1.0,2010.0,56.0,
25%,127.0,7.4,72.0,6.0,8.0,2010.0,72.0,
50%,197.0,9.7,78.5,7.0,16.0,2010.0,78.5,
75%,255.0,11.875,84.0,8.0,24.0,2010.0,84.0,


In [25]:
# List the distinct labels in the 'Weather' column (NaN is reported as its own value).
weather_report['Weather'].unique()

array(['S', 'C', 'PS', nan], dtype=object)

In [26]:
# Most frequent 'Weather' label. mode() returns a Series (ties yield several
# entries), so take its first element.
weather_report['Weather'].mode().iloc[0]

'S'

In [27]:
# Replace the missing 'Weather' labels with the most frequent label (the mode).
most_common_weather = weather_report['Weather'].mode()[0]
weather_report['Weather'] = weather_report['Weather'].fillna(most_common_weather)

In [28]:
# Final check: count the missing values left in each column.
weather_report.isna().sum()

Solar.R    0
Wind       0
Temp C     0
Month      0
Day        0
Year       0
Temp       0
Weather    0
dtype: int64

In [29]:
# Encode the categorical 'Weather' labels as integer codes with LabelEncoder.
# OneHotEncoder is imported alongside it but is not used in this cell.
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
# Learn the label set first, then map every label to its integer code.
# The fitted encoder is kept in `le`.
le = LabelEncoder().fit(weather_report['Weather'])
weather_report['Weather'] = le.transform(weather_report['Weather'])

In [30]:
# Preview up to the first fifty records after encoding 'Weather'.
weather_report.head(n=50)

Unnamed: 0,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
0,190.0,7.4,67.0,5,1,2010,67,2
1,118.0,8.0,72.0,5,2,2010,72,0
2,149.0,12.6,74.0,5,3,2010,74,1
3,313.0,11.5,62.0,5,4,2010,62,2
4,197.0,14.3,56.0,5,5,2010,56,2
5,197.0,14.9,66.0,5,6,2010,66,0
6,299.0,8.6,65.0,5,7,2010,65,1
7,99.0,13.8,59.0,5,8,2010,59,0
8,19.0,20.1,61.0,5,9,2010,61,1
9,194.0,8.6,69.0,5,10,2010,69,2


### >>>>>>>>>>>>>>>>>>>>>>>> Data Cleaning Completed <<<<<<<<<<<<<<<<<<<<<<<<<<<<<