In [1]:
import pandas as pd

# Load the uploaded file

df = pd.read_csv('DATA/weather_prediction_dataset2.csv')

# Display the first few rows to understand its structure
df.head()

# Reload the dataset with the correct delimiter
df = pd.read_csv('DATA/weather_prediction_dataset2.csv', delimiter=';')

# Display the first few rows to understand its structure with the correct delimiter
df.head()

# Convert the DataFrame from wide to long format
df_long = pd.wide_to_long(df, 
                          stubnames='temp_mean', 
                          i=['DATE', 'MONTH'], 
                          j='Ville', 
                          sep='_', 
                          suffix='.+').reset_index()

# Display the transformed DataFrame

print("Long Format Weather Data")
display(df_long.head())  # Affiche les 5 premières lignes du DataFrame

# Inspect the column names to understand the structure better
df.columns

# Since each city has a unique prefix before "_temp_mean", we need to adjust our conversion strategy
# We'll directly perform the wide-to-long transformation manually without relying on suffix-based splitting

df_long_corrected = pd.melt(df, 
                            id_vars=['DATE', 'MONTH'], 
                            var_name='Ville', 
                            value_name='Temperature')

# Display the corrected long-format DataFrame
print("Corrected Long Format Weather Data")
print(df_long_corrected.head())  # Affiche les premières lignes du DataFrame

df_long_corrected.Ville = df_long_corrected.Ville.str.replace("_temp_mean","")
df_long_corrected.Ville

df_long_corrected.DATE= pd.to_datetime(df_long_corrected.DATE, format='%Y%m%d')
#df_long_corrected

data = df_long_corrected.dropna(axis=0)

print(data.head())

data['year'] = pd.to_datetime(data['DATE']).dt.year
data['month'] = pd.to_datetime(data['DATE']).dt.month
data['day'] = pd.to_datetime(data['DATE']).dt.day

data['Ville_code'] = pd.factorize(data['Ville'])[0] + 1

print(data)

Long Format Weather Data


Unnamed: 0,DATE,MONTH,Ville,BASEL_temp_mean,BUDAPEST_temp_mean,DE_BILT_temp_mean,DRESDEN_temp_mean,DUSSELDORF_temp_mean,HEATHROW_temp_mean,KASSEL_temp_mean,...,MALMO_temp_mean,MONTELIMAR_temp_mean,MUENCHEN_temp_mean,OSLO_temp_mean,PERPIGNAN_temp_mean,ROMA_temp_mean,SONNBLICK_temp_mean,STOCKHOLM_temp_mean,TOURS_temp_mean,temp_mean


Corrected Long Format Weather Data
       DATE  MONTH            Ville  Temperature
0  20000101      1  BASEL_temp_mean          2.9
1  20000102      1  BASEL_temp_mean          3.6
2  20000103      1  BASEL_temp_mean          2.2
3  20000104      1  BASEL_temp_mean          3.9
4  20000105      1  BASEL_temp_mean          6.0
        DATE  MONTH  Ville  Temperature
0 2000-01-01      1  BASEL          2.9
1 2000-01-02      1  BASEL          3.6
2 2000-01-03      1  BASEL          2.2
3 2000-01-04      1  BASEL          3.9
4 2000-01-05      1  BASEL          6.0
            DATE  MONTH  Ville  Temperature  year  month  day  Ville_code
0     2000-01-01      1  BASEL          2.9  2000      1    1           1
1     2000-01-02      1  BASEL          3.6  2000      1    2           1
2     2000-01-03      1  BASEL          2.2  2000      1    3           1
3     2000-01-04      1  BASEL          3.9  2000      1    4           1
4     2000-01-05      1  BASEL          6.0  2000      1    5

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [3]:
X = data[['year', 'month','day','Ville_code']]
y = data['Temperature']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
model = RandomForestRegressor()
model.fit(X_train, y_train)

In [5]:
y_pred = model.predict(X_test)
y_pred

array([13.393, 11.13 ,  3.175, ..., 19.086, 20.229, 15.987])

In [6]:
X_test[1:2]

Unnamed: 0,year,month,day,Ville_code
21127,2007,10,28,6


In [7]:
model.predict(pd.DataFrame({"year":[2025],"month":[7],"day":[27],"Ville_code":[18]}))

array([18.149])

In [8]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [9]:
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R²: {r2}")

MAE: 1.5644722158874953
MSE: 4.237178568757127
R²: 0.9409606031692376


In [10]:
import joblib
joblib.dump(model, 'C:\\Users\\theog\\ProjetIA.venv\\.venv\\modele_prediction.pkl')

['C:\\Users\\theog\\ProjetIA.venv\\.venv\\modele_prediction.pkl']