In [108]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error

In [109]:
file_path = "dataset/TEMP_ANNUAL_SEASONAL_MEAN.csv"

df = pd.read_csv(file_path)

print(df.head())

     YEAR ANNUAL JAN-FEB MAR-MAY JUN-SEP OCT-DEC
0  1901.0  25.42   20.11   27.64   28.16    23.1
1  1902.0  25.42   20.88   27.96   27.98   22.51
2  1903.0  25.01   19.99   27.02      28   22.33
3  1904.0  24.93   19.76   27.23   27.57   22.56
4  1905.0  24.84   18.36   26.38    28.2   23.13


In [110]:
#INFO
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123 entries, 0 to 122
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   YEAR     121 non-null    float64
 1   ANNUAL   122 non-null    object 
 2   JAN-FEB  122 non-null    object 
 3   MAR-MAY  122 non-null    object 
 4   JUN-SEP  122 non-null    object 
 5   OCT-DEC  122 non-null    object 
dtypes: float64(1), object(5)
memory usage: 5.9+ KB
None


In [111]:
#DESCRIPTION
print(df.describe())

              YEAR
count   121.000000
mean   1961.000000
std      35.073732
min    1901.000000
25%    1931.000000
50%    1961.000000
75%    1991.000000
max    2021.000000


In [112]:
print(df.isna().any())

YEAR       True
ANNUAL     True
JAN-FEB    True
MAR-MAY    True
JUN-SEP    True
OCT-DEC    True
dtype: bool


In [113]:
df = df.dropna()

In [114]:
print(df.isna().any())

YEAR       False
ANNUAL     False
JAN-FEB    False
MAR-MAY    False
JUN-SEP    False
OCT-DEC    False
dtype: bool


In [115]:
X = df.drop(columns=['ANNUAL', 'YEAR'])
y = df['ANNUAL']

print(X.shape, y.shape)


(121, 4) (121,)


In [116]:
#USING CROSS VALIDATION

lr = LinearRegression()
lr.fit(X, y)
lr_val = cross_val_score(lr, X, y, cv=5)
print(lr_val)


[0.99964387 0.99974763 0.99980858 0.99957379 0.99964638]


In [118]:
#USING TRAIN TEST SPLIT

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
lr = LinearRegression()

lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(mse)
print(y_test[0:10])
print(y_pred[0:10])
print(y_test[0:20])
print(y_pred[10:20])

1.1531091207278751e-05
44     24.69
47     25.29
4      24.84
55     24.95
26     25.01
64     25.23
73     25.08
10     25.08
40     25.61
108    26.05
Name: ANNUAL, dtype: object
[24.68925292 25.29070045 24.83666126 24.94555184 25.00699628 25.23223289
 25.07886196 25.0850512  25.60437602 26.0480495 ]
44     24.69
47     25.29
4      24.84
55     24.95
26     25.01
64     25.23
73     25.08
10     25.08
40     25.61
108    26.05
18     25.15
62     25.24
11      25.2
36        25
90     25.39
118    25.86
110    25.62
0      25.42
89     25.26
104    25.58
Name: ANNUAL, dtype: object
[25.15699277 25.24470651 25.20121206 25.0033758  25.39275915 25.85712814
 25.6178276  25.42268445 25.26476915 25.58359786]
