In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

In [3]:
data = pd.read_csv('/content/testset.csv')
data.head(5)

Unnamed: 0,datetime_utc,_conds,_dewptm,_fog,_hail,_heatindexm,_hum,_precipm,_pressurem,_rain,_snow,_tempm,_thunder,_tornado,_vism,_wdird,_wdire,_wgustm,_windchillm,_wspdm
0,19961101-11:00,Smoke,9.0,0,0,,27.0,,1010.0,0,0,30.0,0,0,5.0,280.0,West,,,7.4
1,19961101-12:00,Smoke,10.0,0,0,,32.0,,-9999.0,0,0,28.0,0,0,,0.0,North,,,
2,19961101-13:00,Smoke,11.0,0,0,,44.0,,-9999.0,0,0,24.0,0,0,,0.0,North,,,
3,19961101-14:00,Smoke,10.0,0,0,,41.0,,1010.0,0,0,24.0,0,0,2.0,0.0,North,,,
4,19961101-16:00,Smoke,11.0,0,0,,47.0,,1011.0,0,0,23.0,0,0,1.2,0.0,North,,,0.0


In [4]:
data.columns

Index(['datetime_utc', ' _conds', ' _dewptm', ' _fog', ' _hail',
       ' _heatindexm', ' _hum', ' _precipm', ' _pressurem', ' _rain', ' _snow',
       ' _tempm', ' _thunder', ' _tornado', ' _vism', ' _wdird', ' _wdire',
       ' _wgustm', ' _windchillm', ' _wspdm'],
      dtype='object')

In [5]:
data.isnull().sum()

Unnamed: 0,0
datetime_utc,0
_conds,72
_dewptm,621
_fog,0
_hail,0
_heatindexm,71835
_hum,757
_precipm,100990
_pressurem,232
_rain,0


In [6]:
# Convert '-9999' to NaN for appropriate imputation
data.replace(-9999, np.nan, inplace=True)

In [7]:
# Handle datetime_utc column (convert to datetime)
data['datetime_utc'] = pd.to_datetime(data['datetime_utc'])

In [8]:
# Create new features from datetime (e.g., year, month, day, hour)
data['year'] = data['datetime_utc'].dt.year
data['month'] = data['datetime_utc'].dt.month
data['day'] = data['datetime_utc'].dt.day
data['hour'] = data['datetime_utc'].dt.hour

In [9]:
# Drop the original datetime column
data = data.drop(columns=['datetime_utc'])

In [10]:
# Separate the numeric columns
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns

In [11]:
# Remove columns with all missing values (which can't be imputed)
numeric_columns = [col for col in numeric_columns if data[col].isnull().sum() < len(data)]

In [12]:
# Handle missing values for numeric columns (using mean imputation)
imputer = SimpleImputer(strategy='mean')
data[numeric_columns] = imputer.fit_transform(data[numeric_columns])

In [13]:
# Handle missing values for categorical columns (using most frequent value)
categorical_columns = data.select_dtypes(include=['object']).columns
imputer_cat = SimpleImputer(strategy='most_frequent')
data[categorical_columns] = imputer_cat.fit_transform(data[categorical_columns])

In [14]:
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

In [15]:
selected_columns = [' _dewptm', ' _fog', ' _hail',  ' _hum',
                    ' _pressurem', ' _rain', ' _snow', ' _tempm', ' _thunder', ' _tornado',
                    ' _vism', ' _wdird', ' _wspdm', 'year', 'month', 'day', 'hour']

data = data[selected_columns]

In [16]:
X = data.drop(columns=[' _rain'])
y = data[' _rain']

In [17]:
data[' _rain'].values

array([-0.16422001, -0.16422001, -0.16422001, ..., -0.16422001,
       -0.16422001, -0.16422001])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

In [19]:
# Predict on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))

Mean Absolute Error: 0.1928798178123867
Mean Squared Error: 0.5720781876876123
R-squared: 0.4022294155758688


without GA

In [20]:
tpot = TPOTRegressor(verbosity=2, generations=5, population_size=20, random_state=42)

tpot.fit(X_train, y_train)
y_pred_ga = tpot.predict(X_test)
print("Mean Absolute Error (with GA):", mean_absolute_error(y_test, y_pred_ga))
print("Mean Squared Error (with GA):", mean_squared_error(y_test, y_pred_ga))
print("R-squared (with GA):", r2_score(y_test, y_pred_ga))

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -0.5979580761495168

Generation 2 - Current best internal CV score: -0.5823264323610784

Generation 3 - Current best internal CV score: -0.5709738902189725

Generation 4 - Current best internal CV score: -0.5709738902189725

Generation 5 - Current best internal CV score: -0.5709738902189725

Best pipeline: RandomForestRegressor(input_matrix, bootstrap=False, max_features=0.4, min_samples_leaf=2, min_samples_split=4, n_estimators=100)
Mean Absolute Error (with GA): 0.17905697062975376
Mean Squared Error (with GA): 0.5478840756651404
R-squared (with GA): 0.4275101006195595


The use of TPOT and its genetic algorithm optimization has resulted in a more accurate and reliable rainfall prediction model. The improved metrics (lower MAE and MSE, higher R-squared) demonstrate the effectiveness of GA in finding a better model configuration. This suggests that using TPOT can help improve the performance of predictive models in weather forecasting scenarios.

#Dataset 2

In [21]:
jaipur = pd.read_csv('/content/JaipurFinalCleanData.csv')
jaipur.head(5)

Unnamed: 0,date,meantempm,maxtempm,mintempm,meantempm_1,meantempm_2,meantempm_3,meandewptm_1,meandewptm_2,meandewptm_3,...,mindewptm_3,maxpressurem_1,maxpressurem_2,maxpressurem_3,minpressurem_1,minpressurem_2,minpressurem_3,precipm_1,precipm_2,precipm_3
0,2016-05-04,34,41,27,35.0,36.0,34.0,6.0,4.0,-1.0,...,-10.0,1009.0,1008.0,1009.0,1000.0,1001.0,999.0,0.0,0.0,0.0
1,2016-05-05,31,38,24,34.0,35.0,36.0,7.0,6.0,4.0,...,-2.0,1008.0,1009.0,1008.0,1001.0,1000.0,1001.0,0.0,0.0,0.0
2,2016-05-06,28,34,21,31.0,34.0,35.0,11.0,7.0,6.0,...,-2.0,1011.0,1008.0,1009.0,1003.0,1001.0,1000.0,5.0,0.0,0.0
3,2016-05-07,30,38,23,28.0,31.0,34.0,13.0,11.0,7.0,...,0.0,1011.0,1011.0,1008.0,1004.0,1003.0,1001.0,0.0,5.0,0.0
4,2016-05-08,34,41,26,30.0,28.0,31.0,10.0,13.0,11.0,...,6.0,1010.0,1011.0,1011.0,1002.0,1004.0,1003.0,0.0,0.0,5.0


In [22]:
jaipur.columns

Index(['date', 'meantempm', 'maxtempm', 'mintempm', 'meantempm_1',
       'meantempm_2', 'meantempm_3', 'meandewptm_1', 'meandewptm_2',
       'meandewptm_3', 'meanpressurem_1', 'meanpressurem_2', 'meanpressurem_3',
       'maxhumidity_1', 'maxhumidity_2', 'maxhumidity_3', 'minhumidity_1',
       'minhumidity_2', 'minhumidity_3', 'maxtempm_1', 'maxtempm_2',
       'maxtempm_3', 'mintempm_1', 'mintempm_2', 'mintempm_3', 'maxdewptm_1',
       'maxdewptm_2', 'maxdewptm_3', 'mindewptm_1', 'mindewptm_2',
       'mindewptm_3', 'maxpressurem_1', 'maxpressurem_2', 'maxpressurem_3',
       'minpressurem_1', 'minpressurem_2', 'minpressurem_3', 'precipm_1',
       'precipm_2', 'precipm_3'],
      dtype='object')

In [23]:
X = jaipur.drop(columns=['date', 'meantempm'])  # Features
y = jaipur['meantempm']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Random Forest Regressor

In [24]:
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
print("Random Forest (Without GA):")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))
print("R-squared (R²):", r2_score(y_test, y_pred))

Random Forest (Without GA):
Mean Absolute Error (MAE): 0.3458823529411765
Mean Squared Error (MSE): 0.2737382352941176
R-squared (R²): 0.9923994475668925


Without GA

In [25]:
# Initialize TPOTRegressor
tpot = TPOTRegressor(verbosity=2, generations=5, population_size=20, random_state=42)

# Fit the TPOT model on the training data
tpot.fit(X_train, y_train)

# Predict on the test set
y_pred_ga = tpot.predict(X_test)

# Evaluate the model
print("TPOT (With GA):")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred_ga))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred_ga))
print("R-squared (R²):", r2_score(y_test, y_pred_ga))

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -0.12854209511504258

Generation 2 - Current best internal CV score: -0.12390984650132868

Generation 3 - Current best internal CV score: -0.04621604938271602

Generation 4 - Current best internal CV score: -0.04621604938271602

Generation 5 - Current best internal CV score: -0.04621604938271602

Best pipeline: DecisionTreeRegressor(RidgeCV(input_matrix), max_depth=7, min_samples_leaf=3, min_samples_split=4)
TPOT (With GA):
Mean Absolute Error (MAE): 0.017156862745098006
Mean Squared Error (MSE): 0.013888888888888843
R-squared (R²): 0.9996143643283001




The TPOT model, after optimization with GA, also produces MAE, MSE, and R-squared values. These values are likely to be better (lower MAE and MSE, higher R-squared) compared to the Random Forest model without GA. This indicates that the GA optimization has helped find a more accurate and robust model configuration for predicting temperature.