#### <center><strong style="color:#363062">Rainfall Forecast</strong></center>

##### <strong style="color:#ff9717"><em>Tiền xử lý dữ liệu</em></strong>

In [None]:
# Import library

import pandas as pd
import matplotlib.pyplot as plt 

In [None]:
# Load the CSV and keep only the date and rainfall columns, under fixed names

date_col = 'Date'
rainfall_col = 'RF_LeThuy'

raw = pd.read_csv('data.csv')
df = pd.DataFrame({
    'Date': raw[date_col],
    'Rainfall': raw[rainfall_col],
})

In [None]:
# Coerce both columns to their proper dtypes.
# errors='coerce' turns unparseable entries into NaT / NaN instead of raising.

parsed_dates = pd.to_datetime(df['Date'], format='%m/%d/%Y', errors='coerce')
numeric_rainfall = pd.to_numeric(df['Rainfall'], errors='coerce')

df['Date'] = parsed_dates
df['Rainfall'] = numeric_rainfall

In [None]:
# Report, per column, how many entries are missing and what share of rows that is

total_rows = len(df)
for position, column in enumerate(df.columns):
  n_missing = df[column].isna().sum()
  share = n_missing / total_rows * 100
  header = f'> Columns {position}[{column}][{df[column].dtypes}]'
  details = f' missing entries: {n_missing}, percentage {share}%'
  print(header + details)

In [None]:
# Fill missing rainfall readings with the most frequent observed value (the mode)

mode_value = df['Rainfall'].mode()[0]
# Assign the filled Series back to the column rather than calling
# fillna(inplace=True) on the df['Rainfall'] selection: that chained in-place
# form is deprecated and leaves df unchanged when pandas Copy-on-Write is on.
df['Rainfall'] = df['Rainfall'].fillna(mode_value)

In [None]:
# Derive the calendar month of every row, then average the rainfall per month
df['Month'] = df['Date'].dt.month
monthly_avg = df.groupby('Month')['Rainfall'].mean()

# Bar chart of the monthly averages
plt.figure(figsize=(10, 6))
bars = plt.bar(monthly_avg.index, monthly_avg.values)
plt.title('Average rainfall chart by month')
plt.xlabel('Month')
plt.ylabel('Average rainfall')

# Label each bar with its value (two decimals), 3 points above the bar top
for rect in bars:
    top = rect.get_height()
    center_x = rect.get_x() + rect.get_width() / 2
    plt.annotate(
        f'{top:.2f}',
        xy=(center_x, top),
        xytext=(0, 3),
        textcoords='offset points',
        ha='center',
        va='bottom',
    )

plt.xticks(range(1, 13))
plt.show()

In [None]:
# Display the frame after type coercion and imputation; it now also carries
# the derived 'Month' column.
df

In [None]:
# Keep only the rows from months 8 through 12 (inclusive) and drop the
# calendar columns, leaving 'Rainfall' as the only column

selected_months = df['Month'].between(8, 12)
data = df[selected_months].drop(columns=['Date', 'Month']).copy()

In [None]:
# Categorize the data into 5 levels (0-4)

def CategorizeRainFall(x):
  """Map a rainfall amount to an integer class label from 0 to 4.

  Returns 0 when x is exactly 0; otherwise 1 for x <= 15, 2 for x <= 50,
  3 for x <= 100, and 4 for everything else.
  """
  if x == 0:
    return 0
  # Inclusive upper bounds of classes 1, 2 and 3, checked in ascending order.
  for level, upper_bound in enumerate((15, 50, 100), start=1):
    if x <= upper_bound:
      return level
  return 4

# Replace the raw amounts with their class labels.
# NOTE: this overwrites 'Rainfall' in place, so running the cell a second
# time would categorize the labels themselves rather than the raw amounts.
rainfall_levels = data['Rainfall'].apply(CategorizeRainFall)
data['Rainfall'] = rainfall_levels

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report

def ScoreSVM(X, y, kernel='sigmoid', test_size=0.2, random_state=42, shuffle=True):
  """Fit an SVM classifier on a hold-out split and return its test accuracy.

  Parameters
  ----------
  X : feature matrix accepted by ``train_test_split`` and ``SVC.fit``.
  y : class labels aligned with the rows of ``X``.
  kernel : kernel name passed to ``SVC`` (default ``'sigmoid'``).
  test_size : share of the samples held out for testing (default 0.2).
  random_state : seed for the shuffled split (default 42).
  shuffle : whether rows are shuffled before splitting (default True, the
    ``train_test_split`` default). For time-ordered samples pass
    ``shuffle=False`` so the test rows all come after the training rows.

  Returns
  -------
  The accuracy of the fitted model on the held-out test rows.
  """
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=test_size, random_state=random_state, shuffle=shuffle)
  model = SVC(kernel=kernel)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  return accuracy_score(y_test, y_pred)

In [None]:
def ProcessData(data, days):
  """Build a lagged-feature dataset from the 'Rainfall' column.

  For every row the features are the 'Rainfall' values of the `days`
  preceding rows (columns '1 Days Before' ... '<days> Days Before') and the
  target is the row's own 'Rainfall' value.

  Parameters
  ----------
  data : DataFrame with a 'Rainfall' column. It is copied, not modified.
  days : number of preceding rows to expose as features.

  Returns
  -------
  (X, y) : X holds the lag columns (plus any other non-'Rainfall' columns of
    `data`), y is the 'Rainfall' Series. The first `days` rows, which have no
    complete history, are removed from both.

  NOTE(review): lags are positional, so rows are assumed to be consecutive
  observations - confirm this holds for the frame passed in.
  """
  df = data.copy()
  for lag in range(1, days + 1):
    # A positive shift moves values down, so each row receives the value seen
    # `lag` rows earlier. A negative shift would pull in future values instead.
    df[f'{lag} Days Before'] = df['Rainfall'].shift(lag)
  # Drop the leading rows whose lag columns are NaN. Unlike df[:-days], this
  # slice also keeps every row when days == 0.
  df = df.iloc[days:]
  X = df.drop(columns=['Rainfall'])
  y = df['Rainfall']
  return X, y

In [None]:
# Score the SVM once per look-back window size from 2 to 59 rows
acc = []

for i in range(2, 60):
  X, y = ProcessData(data, i)
  window_accuracy = ScoreSVM(X, y)
  acc += [window_accuracy]

best_accuracy = max(acc)
print(acc)
print(best_accuracy)
