In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.metrics import r2_score
from tensorflow.python.keras.losses import mean_squared_error, mean_absolute_error

In [None]:
# Load the dataset CSV into a DataFrame and display its (rows, columns) shape.
# NOTE(review): this is an absolute, machine-specific path; consider a path relative
# to a configurable data directory so the notebook runs on other machines.
DATA_CSV = "/Users/zafiraibraeva/Code/uni coding/thesis/thesis_code/thesis/webapp/dataset/final_data.csv"
df = pd.read_csv(DATA_CSV)
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the Station1_PM10 column.
station1_pm10 = df["Station1_PM10"]
station1_pm10.describe()

In [None]:
# The CSV's first column was read in under the placeholder name "Unnamed: 0".
# Copy it into a properly named "Date" column, then remove the placeholder column.
unnamed_col = "Unnamed: 0"
df["Date"] = df[unnamed_col]
df.drop(columns=[unnamed_col], inplace=True)

In [None]:
# Use the "Date" column as the row index (the column is removed from the regular columns).
df = df.set_index("Date")

In [None]:
# Count of missing values in each column.
missing_mask = df.isna()
missing_mask.sum(axis=0)

In [None]:
# Input columns for the regression model, listed one per line in the order
# they are passed to the estimator.
features = [
    # Station 1 readings
    "Station1_CO",
    "Station1_NO2",
    "Station1_NOx",
    # Station 2 readings
    "Station2_CO",
    "Station2_NO2",
    "Station2_NOx",
    "Station2_O3",
    # PM10 measured at station 1 (used as a predictor)
    "Station1_PM10",
    # Weather columns
    "temp",
    "humidity",
    "precip",
    "precipcover",
    "cloudcover",
    "windspeed",
    "visibility",
    "winddir_sin",
    "winddir_cos",
    # Calendar flags
    "is_heating_season",
    "is_work_day",
]

# Column the model is trained to predict.
target = "Station2_PM10"

In [None]:
# Report the number of missing values per column before imputing.
print(df.isnull().sum())

# Forward-fill gaps: each missing value takes the last valid value above it in the
# same column. `DataFrame.ffill()` replaces `fillna(method="ffill")`, whose `method`
# argument is deprecated in pandas 2.1+.
# Note: forward-filling cannot fill missing values at the very start of a column.
df = df.ffill()

In [None]:
# Write the current state of the DataFrame (index included) to a CSV file.
# NOTE(review): this is an absolute, machine-specific path; consider making it configurable.
RESULT_CSV = "/Users/zafiraibraeva/Code/uni coding/thesis/thesis_code/thesis/data/final_data/result_data.csv"
df.to_csv(RESULT_CSV)

In [None]:
# Min-max scale the target column, keeping the fitted scaler so predictions can be
# mapped back to the original scale with `scaler_target.inverse_transform`.
#
# The unscaled values are preserved in a side column and the scaler is always fitted
# on them. Overwriting df[target] directly made this cell non-idempotent: running it
# a second time fitted the scaler on already-scaled data, after which
# `inverse_transform` no longer recovered the original values.
target_raw_col = f"{target}_raw"
if target_raw_col not in df.columns:
    df[target_raw_col] = df[target]

# NOTE(review): the scaler is fitted on the full dataset, i.e. before any
# train/test split performed on `df` afterwards.
scaler_target = MinMaxScaler()
df[target] = scaler_target.fit_transform(
    df[target_raw_col].values.reshape(-1, 1)
).ravel()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
# NOTE(review): train_test_split shuffles rows by default, so for date-ordered data
# the train and test sets interleave in time — confirm this is intended.
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Ordinary least-squares linear regression; the intercept term is fitted (the default).
model = LinearRegression(fit_intercept=True)

In [None]:
# Fit the regression on the training split; the fitted estimator is displayed as the cell output.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out test rows.
y_pred = model.predict(X=X_test)

In [None]:
# Undo the target scaling for both the true and the predicted values.
# The scaler works on 2-D input, so each 1-D array is reshaped to a single column
# and flattened back to 1-D afterwards.
y_test_actual, y_pred_actual = (
    scaler_target.inverse_transform(values.reshape(-1, 1)).flatten()
    for values in (y_test.values, y_pred)
)

In [None]:
# Overlay the actual and predicted values (original scale) for the test samples.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_test_actual, label='Actual PM10', marker='.')
ax.plot(y_pred_actual, label='Predicted PM10', marker='.', alpha=0.7, linestyle='dashed')
ax.legend()
ax.set_title('PM10 Prediction with Linear Regression')
ax.set_xlabel('Test Sample Index')
ax.set_ylabel('PM10')
plt.show()

In [None]:
# Evaluate on the original (inverse-transformed) scale.
#
# Use scikit-learn's regression metrics here. The `mean_absolute_error` and
# `mean_squared_error` names imported at the top of the notebook come from the
# private `tensorflow.python.keras.losses` module: they are loss functions that
# return tensors rather than plain floats, and that module path is not a public API.
# Importing locally keeps this cell correct regardless of the top-level import.
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test_actual, y_pred_actual)
print(f"MAE: {mae}")
mse = mean_squared_error(y_test_actual, y_pred_actual)
print(f"MSE: {mse}")
r2 = r2_score(y_test_actual, y_pred_actual)
print(f"R2: {r2}")

In [None]:
import joblib

# Serialize the fitted regression model to a file in the current working directory.
# NOTE(review): the model was trained on the scaled target, so its predictions need
# `scaler_target` to be converted back to the original scale; the scaler is not
# saved here — confirm the consumer of this file has access to it.
MODEL_FILE = 'pm10_prediction_model.pkl'
joblib.dump(model, MODEL_FILE)