# Random Forest Prediction 


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer


In [2]:
# read in dataset with date column parsed
df = pd.read_csv('cleanedWeatherAUS.csv',
    parse_dates=['Date'],
    index_col='Date')

# drop samples with missing target values
df = df[df['RainTomorrow'].notna()]

# impute missing feature values
imputer = SimpleImputer(strategy='mean')
imputed_df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)


In [3]:
# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(imputed_df.drop('RainTomorrow', axis=1), imputed_df['RainTomorrow'], test_size=0.2, random_state=42)



In [5]:
# Initialize the random forest regressor with hyperparameters
rf = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)

# Train the random forest regressor on the training data
rf.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = rf.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)

# Calculate the r2 score 
r2_score = rf.score(X_test, y_test)
print('R-Squared Score:', r2_score)


Mean Squared Error: 0.15612987519306606
R-Squared Score: 0.35119407540219605
