In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

url='weatherAUS.csv'
df1=pd.read_csv(url,parse_dates=True)

In [5]:
data1=df1.copy()

In [6]:
data1.head()

Unnamed: 0,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp3pm,RainTomorrow
0,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,21.8,No
1,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,24.3,No
2,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,23.2,No
3,E,11.0,9.0,45.0,16.0,1017.6,1012.8,26.5,No
4,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,29.7,No


In [7]:
data1.dropna(subset=['RainTomorrow'] ,inplace=True)

In [8]:
import numpy as np
data1['RainTomorrow']=data1['RainTomorrow'].replace({
    'Yes':1,
    'No':0
}).astype('int64')

In [9]:
data1.head()

Unnamed: 0,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp3pm,RainTomorrow
0,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,21.8,0
1,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,24.3,0
2,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,23.2,0
3,E,11.0,9.0,45.0,16.0,1017.6,1012.8,26.5,0
4,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,29.7,0


In [10]:
data1.isna().sum()

WindDir3pm       3778
WindSpeed9am     1348
WindSpeed3pm     2630
Humidity9am      1774
Humidity3pm      3610
Pressure9am     14014
Pressure3pm     13981
Temp3pm          2726
RainTomorrow        0
dtype: int64

In [11]:
data1['WindDir3pm']=data1['WindDir3pm'].fillna(data1['WindDir3pm'].mode()[0])

In [12]:
data1.isna().sum()

WindDir3pm          0
WindSpeed9am     1348
WindSpeed3pm     2630
Humidity9am      1774
Humidity3pm      3610
Pressure9am     14014
Pressure3pm     13981
Temp3pm          2726
RainTomorrow        0
dtype: int64

In [13]:
data1.sample(5)

Unnamed: 0,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp3pm,RainTomorrow
11786,SW,19.0,24.0,87.0,78.0,1017.6,1017.2,22.4,1
8235,NE,13.0,13.0,81.0,36.0,1031.8,1028.7,15.8,0
46957,NNW,26.0,24.0,55.0,39.0,1019.0,1017.0,24.6,0
141433,E,20.0,24.0,56.0,24.0,1027.6,1022.5,18.7,0
72393,WNW,11.0,20.0,94.0,71.0,1025.4,1025.0,14.6,0


## 1- Handling Missing Values

In [14]:
data1['WindSpeed9am']=data1['WindSpeed9am'].fillna(data1['WindSpeed9am'].mean())

In [15]:
data1['WindSpeed3pm']=data1['WindSpeed3pm'].fillna(data1['WindSpeed3pm'].median())

In [16]:
#using backfill, bfill, pad, ffill and none

In [17]:
data1['WindSpeed3pm']=data1['WindSpeed3pm'].fillna(method='backfill')

In [18]:
data1['Humidity9am']=data1['Humidity9am'].fillna(method='bfill')

In [19]:
data1['Humidity3pm']=data1['Humidity3pm'].fillna(method='bfill')

In [20]:
data1['Pressure9am']=data1['Pressure9am'].fillna(method='pad')

In [21]:
data1['Pressure3pm']=data1['Pressure3pm'].fillna(method='ffill')

In [22]:
data1['Temp3pm']=data1['Temp3pm'].fillna(data1['Temp3pm'].mean())

In [23]:
data1.isna().sum()

WindDir3pm      0
WindSpeed9am    0
WindSpeed3pm    0
Humidity9am     0
Humidity3pm     0
Pressure9am     0
Pressure3pm     0
Temp3pm         0
RainTomorrow    0
dtype: int64

In [24]:
#splitting the data
x=data1.drop(['RainTomorrow'], axis=1)
y=data1.RainTomorrow

In [25]:
from sklearn.preprocessing import StandardScaler,LabelEncoder
scale=StandardScaler()
encode=LabelEncoder()

In [26]:
data1_encode=data1.copy()

In [27]:
x_en=data1.copy()

In [28]:
#for categorical column handeling it with Label Encoder. We can also use pd.getdummies
x_en['WindDir3pm']=encode.fit_transform(x_en['WindDir3pm'])

In [29]:
x_en.head()

Unnamed: 0,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp3pm,RainTomorrow
0,14,20.0,24.0,71.0,22.0,1007.7,1007.1,21.8,0
1,15,4.0,22.0,44.0,25.0,1010.6,1007.8,24.3,0
2,15,19.0,26.0,38.0,30.0,1007.6,1008.7,23.2,0
3,0,11.0,9.0,45.0,16.0,1017.6,1012.8,26.5,0
4,7,7.0,20.0,82.0,33.0,1010.8,1006.0,29.7,0


In [30]:
#normalization with the help of standard scaler
x_scale=scale.fit_transform(x_en)

In [31]:
x_scale

array([[ 1.36627749,  0.67766103,  0.61407145, ..., -1.22505083,
         0.01641234, -0.53755075],
       [ 1.58623629, -1.13003399,  0.38475724, ..., -1.12326063,
         0.38027381, -0.53755075],
       [ 1.58623629,  0.56468009,  0.84338567, ..., -0.99238752,
         0.22017476, -0.53755075],
       ...,
       [-1.05326936, -0.11320554, -1.10578517, ...,  0.519924  ,
         0.40938273, -0.53755075],
       [ 1.36627749, -0.5651293 , -1.10578517, ...,  0.18547049,
         0.64225407, -0.53755075],
       [-1.05326936, -0.11320554, -1.33509939, ...,  0.14184612,
         0.62769961, -0.53755075]])

In [32]:
from sklearn import model_selection,linear_model,metrics

In [33]:
#splitting through datasaet
x_train,x_test,y_train,y_test=model_selection.train_test_split(x_scale,y,test_size=.30)

In [34]:
model=linear_model.LogisticRegression()

In [35]:
model=model.fit(x_train,y_train)

In [37]:
#making the predictions
y_pred=model.predict(x_test)

In [39]:
metrics.accuracy_score(y_test,y_pred)

1.0

In [41]:
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     33240
           1       1.00      1.00      1.00      9418

    accuracy                           1.00     42658
   macro avg       1.00      1.00      1.00     42658
weighted avg       1.00      1.00      1.00     42658

