In [1]:
#importing library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,LogisticRegression,Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR,SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import LinearSVC,SVC
from sklearn.metrics import accuracy_score

In [2]:
#loading the dataset
df=pd.read_csv('/kaggle/input/did-it-rain-in-seattle-19482017/seattleWeather_1948-2017.csv')
#showing the dataset
df

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN
0,1948-01-01,0.47,51,42,True
1,1948-01-02,0.59,45,36,True
2,1948-01-03,0.42,45,35,True
3,1948-01-04,0.31,45,34,True
4,1948-01-05,0.17,45,32,True
...,...,...,...,...,...
25546,2017-12-10,0.00,49,34,False
25547,2017-12-11,0.00,49,29,False
25548,2017-12-12,0.00,46,32,False
25549,2017-12-13,0.00,48,34,False


In [3]:
#checking missing values in the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25551 entries, 0 to 25550
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   DATE    25551 non-null  object 
 1   PRCP    25548 non-null  float64
 2   TMAX    25551 non-null  int64  
 3   TMIN    25551 non-null  int64  
 4   RAIN    25548 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 998.2+ KB


In [6]:
#Checking for missing  value in the dataset
df.isna().sum()

DATE    0
PRCP    3
TMAX    0
TMIN    0
RAIN    3
dtype: int64

In [15]:
#Preproessing the dataset
def preprocess_input(df):
    #creating the copy of the dataset
    
    df=df.copy()

    #dropping the missing value from the dataset
    df=df.dropna(axis=0).reset_index(drop=True)


    #Converting Rain column the numerical column

    df['RAIN']=df['RAIN'].astype(int)


    #Extracting the new column from the time column


    df['DAY']=pd.to_datetime(df['DATE']).dt.day
    df['MONTH']=pd.to_datetime(df['DATE']).dt.month
    df['YEAR']=pd.to_datetime(df['DATE']).dt.year

    #deleting the date column 

    df=df.drop('DATE',axis=1)

    #splitting the dataset between target and feature


    y=df['PRCP']
    x=df.drop('PRCP',axis=1)


    #Splitting and Scaling the dataset

    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=123,shuffle=True)

    #scaling the features

    scaler=StandardScaler()


    scaler.fit(x_train)

    x_train=pd.DataFrame(scaler.transform(x_train),columns=x_train.columns,index=x_train.index)
    x_test=pd.DataFrame(scaler.transform(x_test),columns=x_test.columns,index=x_test.index)

    

    

    
    return x_train,x_test,y_train,y_test

In [16]:
x_train,x_test,y_train,y_test=preprocess_input(df)
x_train

Unnamed: 0,TMAX,TMIN,RAIN,DAY,MONTH,YEAR
18104,1.368350,1.069506,-0.861275,1.163814,0.139784,0.725314
16292,1.368350,0.844793,-0.861275,-0.757199,0.429275,0.477556
19406,-0.663099,-1.177622,-0.861275,0.372808,-1.307673,0.923520
16772,-0.897496,-0.054058,1.161070,-1.548205,1.587241,0.527107
1261,1.133952,0.844793,-0.861275,-0.079195,-0.149707,-1.554062
...,...,...,...,...,...,...
15377,-1.522558,-1.289978,1.161070,-1.096202,-1.307673,0.378452
21602,-1.131894,-0.615840,1.161070,1.050813,-1.307673,1.220830
17730,-0.038037,0.844793,1.161070,0.146807,0.139784,0.675762
15725,-1.053762,-1.514691,-0.861275,0.485809,-1.597164,0.428004


In [20]:
# training the Model


models={'Linear Regression':LinearRegression(),
'K-Neighbors':KNeighborsRegressor(),
'Decision Tree':DecisionTreeRegressor(),
'Support Vector Machine (Linear Kernel)':LinearSVR(),
'Support Vector Machine (Non-Linear Kernel)':SVR(),
'Neural Network':MLPRegressor(),
'Random Forest':RandomForestRegressor(),
'Gradient Boosting':GradientBoostingRegressor(),
'XGB Boosting ':XGBRegressor(),
'Light GBM':LGBMRegressor(),
'Cat Boosting':CatBoostRegressor(verbose=0)}  

In [19]:
models

{'Linear Regression': LinearRegression(),
 'K-Neighbors': KNeighborsRegressor(),
 'Decision Tree': DecisionTreeRegressor(),
 'Support Vector Machine (Linear Kernel)': LinearSVR(),
 'Support Vector Machine (Non-Linear Kernel)': SVR(),
 'Neural Network': MLPRegressor(),
 'Random Forest': RandomForestRegressor(),
 'Gradient Boosting': GradientBoostingRegressor(),
 'XGB Boosting ': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constrain

In [23]:
for name,model in models.items():
    model.fit(x_train,y_train)
    print(name,'has been trained')
    print('The accuracy of the model is',model.score(x_test,y_test))

Linear Regression has been trained
The accuracy of the model is 0.2687595109036267
K-Neighbors has been trained
The accuracy of the model is 0.23092954005337252
Decision Tree has been trained
The accuracy of the model is -0.45495162710671044




Support Vector Machine (Linear Kernel) has been trained
The accuracy of the model is 0.1699070268068622
Support Vector Machine (Non-Linear Kernel) has been trained
The accuracy of the model is 0.27061618133043963
Neural Network has been trained
The accuracy of the model is 0.3193461954303869
Random Forest has been trained
The accuracy of the model is 0.21322897388712725
Gradient Boosting has been trained
The accuracy of the model is 0.32694244227976943
XGB Boosting  has been trained
The accuracy of the model is 0.18396492561039068
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 262
[LightGBM] [Info] Number of data points in the train set: 17883, number of used features: 6
[LightGBM] [Info] Start training from score 0.105197
Light GBM has been trained
The accuracy of the model is 0.3290119563011964
Cat Boosting has been trained
The accuracy of th