# IMPORTING LIBARARIES

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# IMPORTING DATASET & PREPROCESSING

In [2]:
data = pd.read_csv('/content/Rainfall.csv')
x = data.iloc[:,[1,2,3,4,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21]].values
y = data.iloc[:,-1].values

In [3]:
print(x)

[['Albury' 13.4 22.9 ... 16.9 21.8 'No']
 ['Albury' 7.4 25.1 ... 17.2 24.3 'No']
 ['Albury' 12.9 25.7 ... 21.0 23.2 'No']
 ...
 ['Uluru' 5.4 26.9 ... 12.5 26.1 'No']
 ['Uluru' 7.8 27.0 ... 15.1 26.0 'No']
 ['Uluru' 14.9 nan ... 15.0 20.9 'No']]


In [4]:
print(y)

['No' 'No' 'No' ... 'No' 'No' nan]


In [5]:
Y = y.reshape(-1,1) # 1D list into 2D list

In [6]:
print(Y)

[['No']
 ['No']
 ['No']
 ...
 ['No']
 ['No']
 [nan]]


# DEALING WITH INVALID DATASET

In [7]:
from sklearn.impute import SimpleImputer
import numpy as np

imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
x = imputer.fit_transform(x)

# Reshape y to a 2D array before imputation
y = y.reshape(-1, 1)  # Reshape to a column vector
y = imputer.fit_transform(y)

In [8]:
print(x)

[['Albury' 13.4 22.9 ... 16.9 21.8 'No']
 ['Albury' 7.4 25.1 ... 17.2 24.3 'No']
 ['Albury' 12.9 25.7 ... 21.0 23.2 'No']
 ...
 ['Uluru' 5.4 26.9 ... 12.5 26.1 'No']
 ['Uluru' 7.8 27.0 ... 15.1 26.0 'No']
 ['Uluru' 14.9 19.0 ... 15.0 20.9 'No']]


In [9]:
print(y)

[['No']
 ['No']
 ['No']
 ...
 ['No']
 ['No']
 ['No']]


# ENCODING DATASET

In [10]:
from sklearn.preprocessing import LabelEncoder
le1 = LabelEncoder()
x[:,0] = le1.fit_transform(x[:,0])
le2 = LabelEncoder()
x[:,4] = le2.fit_transform(x[:,4])
le3 = LabelEncoder()
x[:,6] = le3.fit_transform(x[:,6])
le4 = LabelEncoder()
x[:,7] = le4.fit_transform(x[:,7])
le5 = LabelEncoder()
x[:,-1] = le5.fit_transform(x[:,-1])
le6 = LabelEncoder()
Y = le6.fit_transform(Y)

  y = column_or_1d(y, warn=True)


In [11]:
print(x)

[[1 13.4 22.9 ... 16.9 21.8 0]
 [1 7.4 25.1 ... 17.2 24.3 0]
 [1 12.9 25.7 ... 21.0 23.2 0]
 ...
 [9 5.4 26.9 ... 12.5 26.1 0]
 [9 7.8 27.0 ... 15.1 26.0 0]
 [9 14.9 19.0 ... 15.0 20.9 0]]


In [12]:
print(Y)

[0 0 0 ... 0 0 2]


# FEATURE SCALING

In [13]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x = sc.fit_transform(x)

In [14]:
print(x)

[[-1.51577598  0.04039106 -0.15101108 ... -0.15006358 -0.10175254
  -0.55609919]
 [-1.51577598 -1.07478085  0.20800875 ... -0.09661746  0.318031
  -0.55609919]
 [-1.51577598 -0.05253993  0.30592325 ...  0.58036664  0.13332624
  -0.55609919]
 ...
 [ 1.01389522 -1.44650482  0.50175225 ... -0.93393991  0.62027515
  -0.55609919]
 [ 1.01389522 -1.00043606  0.51807133 ... -0.47074026  0.60348381
  -0.55609919]
 [ 1.01389522  0.31918404 -0.78745533 ... -0.48855563 -0.25287462
  -0.55609919]]


# SPLITTING THE DATASET INTO TRAINING & TESTING SET

In [15]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, Y, test_size = 0.2, random_state = 0)

In [16]:
print(y_train)

[0 0 0 ... 1 0 1]


# TRAINING MODEL

In [17]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100,random_state=0)
classifier.fit(x_train,y_train)

In [18]:
classifier.score(x_train,y_train)

0.9994065281899109

In [19]:
print(y_test)

[1 0 0 ... 1 0 0]


In [20]:
y_test = y_test.reshape(-1,1)
print(y_test)

[[1]
 [0]
 [0]
 ...
 [1]
 [0]
 [0]]


In [21]:
y_pred = classifier.predict(x_test)

In [22]:
print(y_pred)

[0 0 0 ... 1 0 0]


In [24]:
y_pred = le6.inverse_transform(y_pred)

In [25]:
print(y_pred)

['No' 'No' 'No' ... 'Yes' 'No' 'No']


In [26]:
print(y_test)

[[1]
 [0]
 [0]
 ...
 [1]
 [0]
 [0]]


In [27]:
y_test = le6.inverse_transform(y_test)

  y = column_or_1d(y, warn=True)


In [28]:
print(y_test)

['Yes' 'No' 'No' ... 'Yes' 'No' 'No']


In [29]:
y_test = y_test.reshape(-1,1)
y_pred = y_pred.reshape(-1,1)

In [31]:
data = np.concatenate((y_test,y_pred),axis=1)
data = pd.DataFrame(data,columns=['Rain on Tomorrow','Prediction of Rain'])

In [40]:
print(y_test,y_pred)

[['Yes']
 ['No']
 ['No']
 ...
 ['Yes']
 ['No']
 ['No']] [['No']
 ['No']
 ['No']
 ...
 ['Yes']
 ['No']
 ['No']]


In [32]:
print(data)

     Rain on Tomorrow Prediction of Rain
0                 Yes                 No
1                  No                 No
2                  No                 No
3                  No                 No
4                 Yes                Yes
...               ...                ...
1680              Yes                 No
1681               No                 No
1682              Yes                Yes
1683               No                 No
1684               No                 No

[1685 rows x 2 columns]


In [41]:
data.to_csv('prediction.csv')

# CALCULATING ACCURACY

In [44]:
# Check the data types of your arrays
print(y_test.dtype)
print(y_pred.dtype)

# If either contains strings, convert them to a numerical representation
import numpy as np
y_test_numeric = np.where(y_test == 'Yes', 1, 0) # Assuming 'Yes' maps to 1 and other values to 0
y_pred_numeric = np.where(y_pred == 'Yes', 1, 0)

from sklearn.metrics import accuracy_score
accuracy_score(y_test_numeric, y_pred_numeric)

object
object


0.9008902077151335

In [45]:
data.to_csv('prediction.csv')

#CONCLUSION
- THEIR COULD BE 90% OF RAIN
- THEIR WILL BE "NO" RAIN TOMORROW