In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import cross_val_score

# Load data for all five cities
city1 = pd.read_csv('BeijingPMtrain.csv')
city2 = pd.read_csv('ChengduPMtrain.csv')
city3 = pd.read_csv('GuangzhouPMtrain.csv')
city4 = pd.read_csv('ShanghaiPMtrain.csv')
city5 = pd.read_csv('ShenyangPMtrain.csv')

# Combine data for all five cities
data = pd.concat([city1, city2, city3, city4, city5])

# Handling missing values
data = data.fillna(data.mean())

# Renaming columns
data = data.rename(columns={'PM_Dongsi': 'PM_City1', 
                           'PM_Caotangsi': 'PM_City2', 
                           'PM_City Station': 'PM_City3', 
                           'PM_Jingan': 'PM_City4', 
                           'PM_Taiyuanjie': 'PM_City5'})

# Removing unnecessary columns
data = data.drop(['No','season','cbwd','Iws','precipitation','Iprec'], axis=1)

# Normalizing the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data[['PM_City1', 'PM_City2', 'PM_City3', 'PM_City4', 'PM_City5', 'PM_US Post', 'DEWP', 'HUMI', 'PRES', 'TEMP']] = scaler.fit_transform(data[['PM_City1', 'PM_City2', 'PM_City3', 'PM_City4', 'PM_City5', 'PM_US Post', 'DEWP', 'HUMI', 'PRES', 'TEMP']])


X = data.drop(['PM_City1', 'PM_City2', 'PM_City3', 'PM_City4', 'PM_City5'], axis=1)
y = data[['PM_City1', 'PM_City2', 'PM_City3', 'PM_City4', 'PM_City5']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(data.head())

# best algorithm using Mean Squared Error

# Linear Regression
reg = LinearRegression().fit(X_train, y_train)

y_pred = reg.predict(X_test)
lin_mse = mean_squared_error(y_test, y_pred)

lin_mse_cross = -1*cross_val_score(reg, X, y, cv=5, scoring='neg_mean_squared_error')
lin_mse_mean = np.mean(lin_mse_cross)

# Decision Tree
reg = DecisionTreeRegressor().fit(X_train, y_train)

y_pred = reg.predict(X_test)
dt_mse = mean_squared_error(y_test, y_pred)

dt_mse_cross = -1*cross_val_score(reg, X, y, cv=5, scoring='neg_mean_squared_error')
dt_mse_mean = np.mean(dt_mse_cross)


# Neural Network
reg = MLPRegressor().fit(X_train, y_train)

y_pred = reg.predict(X_test)
nn_mse = mean_squared_error(y_test, y_pred)

nn_mse_cross = -1*cross_val_score(reg, X, y, cv=5, scoring='neg_mean_squared_error')
nn_mse_mean = np.mean(nn_mse_cross)

print(" best algorithm using cross Mean Squared Error ")

# Compare mean squared errors
print("Linear Regression MSE:", lin_mse)
print("Decision Tree MSE:", dt_mse)
print("Neural Network MSE:", nn_mse)

# Find the best algorithm
best_algorithm = min(lin_mse, dt_mse, nn_mse)

if best_algorithm == lin_mse:
    print("Linear Regression is the best algorithm")
elif best_algorithm == dt_mse:
    print("Decision Tree is the best algorithm")
else:
    print("Neural Network is the best algorithm")
from sklearn.model_selection import cross_val_score

# best algorithm using cross validation method 

print(" best algorithm using cross validation method ")

print("Linear Regression MSE:", lin_mse_mean)
print("Decision Tree MSE:", dt_mse_mean)
print("Neural Network MSE:", nn_mse_mean)

# Find the best algorithm
best_algorithm = min(lin_mse_mean, dt_mse_mean, nn_mse_mean)

if best_algorithm == lin_mse_mean:
    print("Linear Regression is the best algorithm")
elif best_algorithm == dt_mse_mean:
    print("Decision Tree is the best algorithm")
else:
    print("Neural Network is the best algorithm")
    

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Define the parameter grid to search over
param_grid = {'max_depth': [2, 4, 6, 8, 10],
              'min_samples_split': [2, 4, 6, 8, 10],
              'min_samples_leaf': [1, 2, 3, 4, 5]}

# Create the decision tree model
reg = DecisionTreeRegressor()

# Create the GridSearchCV object
grid_search = GridSearchCV(reg, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the GridSearchCV object to the data
grid_search.fit(X, y)

# Print the best set of hyperparameters
print("Best hyperparameters:", grid_search.best_params_)



reg = DecisionTreeRegressor(max_depth=4, min_samples_leaf=3, min_samples_split=6)

# Fit the model to the training data
reg.fit(X_train, y_train)
# Make predictions on the test set
y_pred = reg.predict(X_test)
print(y_pred)

# Calculate mean squared error
dt_mse = mean_squared_error(y_test, y_pred)
print("Decision Tree MSE:", dt_mse)

# Making predictions
city = int(input("Enter the city (0 for Beijing, 1 for Chengdu, 2 for Guangzhou, 3 for Shanghai, 4 for Shenyang): "))
day = int(input("Enter the day: "))
month = int(input("Enter the month: "))
year = int(input("Enter the year: "))
hour = int(input("Enter the hour: "))
city_encoded = [0,0,0,0,0]
city_encoded[city] = 1

pm25_pred = reg.predict([np.concatenate((city_encoded, [day, month, year, hour]))])
print("Predicted PM2.5 concentration: ", pm25_pred)


  data = data.fillna(data.mean())


   year  month  day  hour  PM_City1  PM_Dongsihuan  PM_Nongzhanguan  \
0  2010      1    1     0   0.11856      93.723659        89.953275   
1  2010      1    1     1   0.11856      93.723659        89.953275   
2  2010      1    1     2   0.11856      93.723659        89.953275   
3  2010      1    1     3   0.11856      93.723659        89.953275   
4  2010      1    1     4   0.11856      93.723659        89.953275   

   PM_US Post      DEWP      HUMI      PRES      TEMP  PM_City2  PM_Shahepu  \
0     0.07618  0.995113  0.994356  0.647887  0.242857  0.125529   84.143639   
1     0.07618  0.995113  0.994752  0.633803  0.228571  0.125529   84.143639   
2     0.07618  0.995113  0.994356  0.619718  0.242857  0.125529   84.143639   
3     0.07618  0.995113  0.995544  0.619718  0.200000  0.125529   84.143639   
4     0.07618  0.995213  0.995148  0.605634  0.228571  0.125529   84.143639   

   PM_City3  PM_5th Middle School  PM_City4   PM_Xuhui  PM_City5  PM_Xiaoheyan  
0  0.102621      

ValueError: X has 9 features, but DecisionTreeRegressor is expecting 15 features as input.

In [None]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load the dataset into a pandas DataFrame
df = df_pc2

# Split the data into features and target
X = df.drop("pm2.5", axis=1)
y = df["pm2.5"]

# Reshape the target data into a 3D format for the LSTM model
y = np.array(y).reshape((len(y), 1))

# Define the LSTM model
model = Sequential()
model.add(LSTM(32, input_shape=(None, 1)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

# Perform time series cross-validation to evaluate the LSTM model
tscv = TimeSeriesSplit(n_splits=10)
scores = []
for train_index, test_index in tscv.split(y):
    y_train, y_test = y[train_index], y[test_index]
    model.fit(y_train, y_train, epochs=10, batch_size=1, verbose=0)
    y_pred = model.predict(y_test)
    score = mean_absolute_error(y_test, y_pred)
    scores.append(score)

# Print the mean cross-validation score
print("Mean Cross-Validation Score:", np.mean(scores))

# Train the LSTM model on the entire dataset
model.fit(y, y, epochs=10, batch_size=1, verbose=0)

# Make predictions on the entire dataset
y_pred = model.predict(y)

# Evaluate the model using mean absolute error and mean squared error
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
