In [1]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Input locations
train_path = "train.csv"  # Update path if needed
test_path = "test.csv"
submission_path = "sample_submission.csv"

# Read the training and test tables
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Show the first rows of each table
for heading, frame in (
    ("Train Data Preview:", train_df),
    ("\nTest Data Preview:", test_df),
):
    print(heading)
    print(frame.head())

# Count missing values per column in each table
for heading, frame in (
    ("\nMissing Values in Train Data:", train_df),
    ("\nMissing Values in Test Data:", test_df),
):
    print(heading)
    print(frame.isnull().sum())

# Fill the gaps in the test set's 'winddirection' with that column's most
# frequent value; .loc writes the result back without chained assignment.
winddir_filled = test_df['winddirection'].fillna(test_df['winddirection'].mode()[0])
test_df.loc[:, 'winddirection'] = winddir_filled

# Re-count to confirm the column no longer has missing entries
print("\nMissing Values After Handling:")
print(test_df.isnull().sum())

# ==============================
# Exploratory Data Analysis (EDA)
# ==============================

# Class balance of the target. `hue` repeats the x variable (legend hidden)
# so that the palette is applied per class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train_df, x='rainfall', hue='rainfall', palette="viridis", legend=False, ax=ax)
ax.set_title("Distribution of Rainfall (Target Variable)")
ax.set_xlabel("Rainfall (0 = No, 1 = Yes)")
ax.set_ylabel("Count")
plt.show()

# Pairwise correlations between all columns, annotated to two decimals
corr_matrix = train_df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

# Side-by-side boxplots to spot outliers in a handful of features
key_features = ['pressure', 'maxtemp', 'mintemp', 'humidity', 'windspeed']
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=train_df[key_features], ax=ax)
ax.set_title("Boxplot of Key Features")
plt.show()

print("\n Data Preprocessing & EDA Completed!")


SyntaxError: invalid syntax (3823421614.py, line 36)

In [3]:
# Function to remove outliers using Interquartile Range (IQR)
def remove_outliers(df: pd.DataFrame, column: str, factor: float = 1.5) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``column``.

    Args:
        df: Input frame; it is not modified.
        column: Name of the numeric column to filter on.
        factor: Width of the fences in units of IQR. The default of 1.5
            reproduces the conventional Tukey rule used previously.

    Returns:
        A filtered frame that keeps the original index labels of the
        surviving rows. Rows whose ``column`` value is missing fail the range
        test and are therefore dropped too.

    Raises:
        KeyError: If ``column`` is not present in ``df``.
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = factor * (q3 - q1)
    # between() is inclusive on both ends by default, matching >= / <=.
    return df[values.between(q1 - spread, q3 + spread)]

# Filter the training rows one feature at a time; each pass starts from the
# rows that survived the previous pass, so the order of features matters.
outlier_features = ('pressure', 'humidity', 'windspeed')
for feature_name in outlier_features:
    train_df = remove_outliers(train_df, feature_name)

print("Outliers handled successfully!")


✅ Outliers handled successfully!


In [4]:
# Columns excluded from modelling. Declared once so the training and test
# frames are always trimmed identically.
columns_to_drop = ['id', 'day', 'temparature', 'mintemp', 'dewpoint']

# drop() returns a new frame; rebinding the names (instead of inplace=True)
# avoids mutating frame objects that earlier cells displayed.
train_df = train_df.drop(columns=columns_to_drop)
test_df = test_df.drop(columns=columns_to_drop)

print("Feature selection completed!")


✅ Feature selection completed!


In [5]:
from sklearn.model_selection import train_test_split

# Separate the target column (y) from the predictor columns (X)
target_column = 'rainfall'
y = train_df[target_column]
X = train_df.drop(columns=[target_column])

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Train-Test Split Completed!")
print("Training samples:", len(X_train))
print("Validation samples:", len(X_val))


✅ Train-Test Split Completed!
Training samples: 1704
Validation samples: 427


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a logistic-regression baseline with default settings on the unscaled features
model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out validation rows
y_pred = model.predict(X_val)

# Summarise validation performance
val_accuracy = accuracy_score(y_val, y_pred)
val_report = classification_report(y_val, y_pred)
print("Validation Accuracy:", val_accuracy)
print("\nClassification Report:\n", val_report)


Validation Accuracy: 0.8758782201405152

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.63      0.69        93
           1       0.90      0.94      0.92       334

    accuracy                           0.88       427
   macro avg       0.83      0.79      0.81       427
weighted avg       0.87      0.88      0.87       427



In [8]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler

# Cross-validation splitter. Neither `kf` nor `cross_val_score` is defined or
# imported anywhere else in the visible notebook, so they are set up here to
# keep the cell runnable on a fresh kernel.
# NOTE(review): 5 splits matches the five scores in this cell's saved output;
# shuffle=True / random_state=42 are assumptions - confirm against the
# original definition of `kf`.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize scaler
scaler = StandardScaler()

# Fit the scaler on the training split only, then transform it
X_train_scaled = scaler.fit_transform(X_train)

# Apply the same (already fitted) transformation to the validation split
X_val_scaled = scaler.transform(X_val)

# Train Logistic Regression on scaled data; this fitted `model` and `scaler`
# are the objects later cells use for prediction.
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Perform Cross-Validation with Scaled Data.
# NOTE(review): the scaler above was fitted on all of X_train, so each CV
# fold's held-out rows influenced the scaling statistics.
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=kf, scoring='accuracy')

print("Cross-Validation Scores (After Scaling):", cv_scores)
print("Mean Accuracy (After Scaling):", cv_scores.mean())


Cross-Validation Scores (After Scaling): [0.87390029 0.87096774 0.85630499 0.82404692 0.87352941]
Mean Accuracy (After Scaling): 0.8597498706227359


In [11]:
from sklearn.preprocessing import StandardScaler

# Align the test frame with the features used in training (same columns, same
# order). .copy() yields an independent frame, so the column assignments below
# never write into a slice of the previous test_df.
test_features = X_train.columns
test_df = test_df[test_features].copy()

# Scale with the scaler that was fitted on the training data
test_scaled = scaler.transform(test_df)

# Predict rainfall on the test dataset
test_df['rainfall'] = model.predict(test_scaled)

# `id` was dropped during feature selection; restore it from the source file.
# Reading via test_path keeps this in sync with the path configured at load time.
original_test_df = pd.read_csv(test_path)
if len(original_test_df) != len(test_df):
    # The assignment below aligns on the index; a row-count mismatch would
    # silently produce missing or misassigned ids.
    raise ValueError(
        f"Row count mismatch: {len(original_test_df)} rows in {test_path!r} "
        f"vs {len(test_df)} rows in test_df"
    )
test_df['id'] = original_test_df['id']

# Save submission file
test_df[['id', 'rainfall']].to_csv("submission.csv", index=False)

print(" Submission file created successfully!")


✅ Submission file created successfully!
