In [1]:
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
# Read the raw weather records from a CSV located relative to the working directory
file_path = "usa_rain_prediction_dataset_2024_2025.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [5]:
# 1. Count missing entries per column (isna is the same check as isnull)
total_nulls = df.isna().sum()
print("Null Values in Dataset:\n", total_nulls)

Null Values in Dataset:
 Date             0
Location         0
Temperature      0
Humidity         0
Wind Speed       0
Precipitation    0
Cloud Cover      0
Pressure         0
Rain Tomorrow    0
dtype: int64


In [7]:
# 2. Count rows that are flagged as duplicates of another row
duplicate_mask = df.duplicated()
duplicate_count = duplicate_mask.sum()
print(f"Total Duplicate Rows: {duplicate_count}")

Total Duplicate Rows: 0


In [9]:
# 3. Show the dtype pandas inferred for every column
column_dtypes = df.dtypes
print("Data Types of Each Column:\n", column_dtypes)

Data Types of Each Column:
 Date              object
Location          object
Temperature      float64
Humidity         float64
Wind Speed       float64
Precipitation    float64
Cloud Cover      float64
Pressure         float64
Rain Tomorrow      int64
dtype: object


In [11]:
# 4. Convert Date to datetime and extract calendar features.
# A new frame is built instead of mutating in place, and the work is guarded so
# the cell can be re-run: previously 'Date' was dropped with inplace=True, so a
# second execution raised KeyError on df['Date'].
date_feature_names = ['Year', 'Month', 'Day', 'Weekday']
if 'Date' in df.columns:
    parsed_dates = pd.to_datetime(df['Date'])
    # assign() appends the new columns in this order, then the raw column is
    # removed, giving the same column layout as before.
    df = df.assign(
        Year=parsed_dates.dt.year,
        Month=parsed_dates.dt.month,
        Day=parsed_dates.dt.day,
        Weekday=parsed_dates.dt.weekday,  # Monday=0 ... Sunday=6
    ).drop(columns=['Date'])
elif not set(date_feature_names).issubset(df.columns):
    # Neither the raw column nor its derived features exist: fail loudly
    # rather than letting later cells train without date features.
    raise KeyError("'Date' column not found and date features have not been extracted")

In [15]:
# 5. Replace each Location name with an integer code.
# `encoder` stays in scope afterwards and holds the fitted mapping.
# NOTE(review): scikit-learn documents LabelEncoder for target labels; for an
# input feature OrdinalEncoder is the intended tool - consider switching.
encoder = LabelEncoder()
encoder.fit(df['Location'])
df['Location'] = encoder.transform(df['Location'])

In [17]:
# Inspect the frame after date-feature extraction and Location encoding.
# As the last expression of the cell it is rendered as the cell output.
df

Unnamed: 0,Location,Temperature,Humidity,Wind Speed,Precipitation,Cloud Cover,Pressure,Rain Tomorrow,Year,Month,Day,Weekday
0,11,87.524795,75.655455,28.379506,0.000000,69.617966,1026.030278,0,2024,1,1,0
1,11,83.259325,28.712617,12.436433,0.526995,41.606048,995.962065,0,2024,1,2,1
2,11,80.943050,64.740043,14.184831,0.916884,77.364763,980.796739,1,2024,1,3,2
3,11,78.097552,59.738984,19.444029,0.094134,52.541196,979.012163,0,2024,1,4,3
4,11,37.059963,34.766784,3.689661,1.361272,85.584000,1031.790859,0,2024,1,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...
73095,19,40.614393,65.099438,28.778327,0.000000,54.168514,977.083747,0,2025,12,27,5
73096,19,52.641643,30.610525,12.282890,0.871000,22.068055,980.591675,0,2025,12,28,6
73097,19,56.492591,96.740232,2.894762,1.191956,52.336048,1016.469174,1,2025,12,29,0
73098,19,65.748956,63.900004,24.632400,0.483421,76.785280,1032.396146,1,2025,12,30,1


In [21]:
# 6. Detect and Visualize Outliers
# numeric_features = ['Temperature', 'Humidity', 'Wind Speed', 'Precipitation', 'Cloud Cover', 'Pressure']
# plt.figure(figsize=(12, 8))
# for i, col in enumerate(numeric_features, 1):
#     plt.subplot(2, 3, i)
#     sns.boxplot(y=df[col])
#     plt.title(f'Outliers in {col}')
# plt.tight_layout()
# plt.show()

In [23]:
# 7. Visualize Distribution of Numerical Features
# NOTE: this disabled block uses `numeric_features`, which is only defined in
# the (also commented-out) outlier cell 6 - re-enable that definition first,
# otherwise this loop raises NameError.
# plt.figure(figsize=(12, 8))
# for i, col in enumerate(numeric_features, 1):
#     plt.subplot(2, 3, i)
#     sns.histplot(df[col], kde=True, bins=30)
#     plt.title(f'Distribution of {col}')
# plt.tight_layout()
# plt.show()

In [25]:
# 8. Visualize Correlation Heatmap
# plt.figure(figsize=(10, 6))
# sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
# plt.title("Feature Correlation Heatmap")
# plt.show()

In [27]:
# 9. Separate the label column from the predictor columns
target_column = 'Rain Tomorrow'
y = df[target_column]
X = df.drop(columns=[target_column])

In [35]:
# 10. Hold out 20% of the rows for evaluation.
# stratify=y keeps the class ratio of the target the same in both splits;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [37]:
# 11. Assemble the model: standardise the features, then fit a random forest
scaler_step = ('scaler', StandardScaler())
classifier_step = (
    'classifier',
    RandomForestClassifier(n_estimators=100, random_state=42),
)
pipeline = Pipeline(steps=[scaler_step, classifier_step])

In [39]:
# 12. Fit every pipeline step on the training split only
pipeline.fit(X=X_train, y=y_train)

In [41]:
# 13. Predict the label for each held-out row
y_pred = pipeline.predict(X=X_test)

In [43]:
# 14. Score the held-out predictions: per-class report and overall accuracy
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [45]:
# Report the metrics computed in the previous cell.
# NOTE(review): the recorded run shows accuracy 1.0000 on the held-out split -
# confirm the target is not trivially derivable from a feature before trusting it.
print(f"Model Accuracy: {accuracy:.4f}")
# sep=" " is print's default; spelled out because it puts a space before the report
print("Classification Report:\n", report, sep=" ")

Model Accuracy: 1.0000
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     11398
           1       1.00      1.00      1.00      3222

    accuracy                           1.00     14620
   macro avg       1.00      1.00      1.00     14620
weighted avg       1.00      1.00      1.00     14620



In [47]:
# Write the train/test splits and the test-set predictions to CSV files in the
# working directory; row indices are omitted from every file.
csv_outputs = {
    "X_train.csv": X_train,
    "X_test.csv": X_test,
    "y_train.csv": y_train,
    "y_test.csv": y_test,
    "predictions.csv": pd.DataFrame(y_pred, columns=['Predicted Rain']),
}
for csv_name, table in csv_outputs.items():
    table.to_csv(csv_name, index=False)

print("Preprocessing, training, and prediction complete. Results saved.")

Preprocessing, training, and prediction complete. Results saved.


In [49]:
# Persist the fitted pipeline (scaler + random forest) for later reuse.
# `pickle` is imported with the other imports at the top of the notebook.
# NOTE: only unpickle these files from a trusted source - loading a pickle can
# execute arbitrary code.
with open("rain_predict.pkl", "wb") as file:
    pickle.dump(pipeline, file)

# The pipeline was trained on integer Location codes, so the fitted encoder is
# saved as well; without it raw location names cannot be mapped to the same
# codes at prediction time.
with open("location_encoder.pkl", "wb") as encoder_file:
    pickle.dump(encoder, encoder_file)

print("✅ Model trained and saved successfully!")

✅ Model trained and saved successfully!
