In [26]:
#importing library
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


In [27]:
#loading the dataset
df=pd.read_csv('/kaggle/input/szeged-weather/weatherHistory.csv')
#showing the dataset
df.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


In [28]:
#checking for preliminary information about that dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Formatted Date            96453 non-null  object 
 1   Summary                   96453 non-null  object 
 2   Precip Type               95936 non-null  object 
 3   Temperature (C)           96453 non-null  float64
 4   Apparent Temperature (C)  96453 non-null  float64
 5   Humidity                  96453 non-null  float64
 6   Wind Speed (km/h)         96453 non-null  float64
 7   Wind Bearing (degrees)    96453 non-null  float64
 8   Visibility (km)           96453 non-null  float64
 9   Loud Cover                96453 non-null  float64
 10  Pressure (millibars)      96453 non-null  float64
 11  Daily Summary             96453 non-null  object 
dtypes: float64(8), object(4)
memory usage: 8.8+ MB


In [29]:
#checking for missing values
df.isna().sum()

Formatted Date                0
Summary                       0
Precip Type                 517
Temperature (C)               0
Apparent Temperature (C)      0
Humidity                      0
Wind Speed (km/h)             0
Wind Bearing (degrees)        0
Visibility (km)               0
Loud Cover                    0
Pressure (millibars)          0
Daily Summary                 0
dtype: int64

In [47]:
def preprocess_input(df):
    df=df.copy()
    #droppng the summary,daily summary column
    df=df.drop(['Summary','Daily Summary'],axis=1)
    #filling mising  value with the mode of that column
    df['Precip Type']=df['Precip Type'].fillna(df['Precip Type'].mode()[0])
    
    #converting date column to datetime datatype
    df['Formatted Date'] = pd.to_datetime(df['Formatted Date'])
    #Extraciting the date part from the date colum
    df['Day']=df['Formatted Date'].apply(lambda x:x.day)
    df['Month']=df['Formatted Date'].apply(lambda x:x.month)
    df['Year']=df['Formatted Date'].apply(lambda x:x.year)
    df['Hour']=df['Formatted Date'].apply(lambda x:x.hour)
    #dropping the date column
    df=df.drop('Formatted Date',axis=1)
    #encoding the Precip type column with binary encoding
    df['Precip Type']=df['Precip Type'].apply(lambda x:1 if x=='rain' else 0)

    
    #dropping the Loud Cover column

    df=df.drop('Loud Cover',axis=1)

    #spitting the dataset between feature and target

    y=df['Visibility (km)']
    x=df.drop('Visibility (km)',axis=1)

    #tran_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=123,shuffle=True)

    #scaling the dataset
    scaler=StandardScaler()

    scaler.fit(x_train)

    x_train=pd.DataFrame(scaler.transform(x_train),columns=x_train.columns,index=x_train.index)
    x_test=pd.DataFrame(scaler.transform(x_test),columns=x_test.columns,index=x_test.index)







    
    return x_train,x_test,y_train,y_test

In [49]:
x_train,x_test,y_train,y_test=preprocess_input(df)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

  df['Formatted Date'] = pd.to_datetime(df['Formatted Date'])


(67517, 11)
(28936, 11)
(67517,)
(28936,)


In [50]:
x_train

Unnamed: 0,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Pressure (millibars),Day,Month,Year,Hour
13596,0.355013,1.627626,1.501724,-2.024300,1.461847,1.055230,0.006060,1.168975,-0.151306,-1.269615,0.073065
79326,0.355013,0.441592,0.495554,-0.590020,-0.428029,0.393836,0.124929,1.055362,-0.730792,1.264744,-0.359778
72378,-2.816801,-1.472340,-1.832447,0.075895,2.182800,-0.360712,0.170144,-1.671337,-1.310278,0.947949,0.938750
13895,0.355013,-0.328167,-0.300348,0.332017,-0.425696,0.095743,0.033564,-1.671337,-1.020535,-1.269615,1.660155
37609,-2.816801,-1.261296,-1.354283,0.793035,-0.005723,0.365890,0.060556,0.600913,-1.310278,-0.319230,-1.514026
...,...,...,...,...,...,...,...,...,...,...,...
63206,0.355013,-0.529909,-0.594723,0.075895,0.206596,-0.193034,0.233154,0.714525,1.587150,0.631154,0.361627
61404,0.355013,-0.613047,-0.693367,-0.487572,0.290591,0.943445,0.059364,-1.671337,-0.730792,0.631154,0.073065
17730,0.355013,0.033457,0.131090,-0.641245,2.565441,0.952761,0.068560,0.032850,-0.730792,-0.952820,0.938750
28030,-2.816801,-1.882801,-1.882288,0.536914,-0.598351,0.589460,0.047273,0.260075,1.587150,-0.636025,1.515874


In [51]:
y_train

13596     9.9820
79326    16.1000
72378    15.9229
13895    14.7154
37609    14.9569
          ...   
63206     6.9069
61404    11.2700
17730    11.2056
28030     6.3434
15725     5.8926
Name: Visibility (km), Length: 67517, dtype: float64

In [52]:
#Training the Model

models={'Linear Regression':LinearRegression(),
           'Decision Tree':DecisionTreeRegressor(),
           'Random Forest':RandomForestRegressor()}

In [53]:
models

{'Linear Regression': LinearRegression(),
 'Decision Tree': DecisionTreeRegressor(),
 'Random Forest': RandomForestRegressor()}

In [54]:
for name,model in models.items():
    model.fit(x_train,y_train)
    print(name)
    print(model.score(x_test,y_test))

Linear Regression
0.24016638871360263
Decision Tree
0.5073312422903246
Random Forest
0.7578210305149131


In [39]:
x['Loud Cover'].unique()

array([0.])

In [35]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Precip Type               96453 non-null  object 
 1   Temperature (C)           96453 non-null  float64
 2   Apparent Temperature (C)  96453 non-null  float64
 3   Humidity                  96453 non-null  float64
 4   Wind Speed (km/h)         96453 non-null  float64
 5   Wind Bearing (degrees)    96453 non-null  float64
 6   Visibility (km)           96453 non-null  float64
 7   Loud Cover                96453 non-null  float64
 8   Pressure (millibars)      96453 non-null  float64
 9   Day                       96453 non-null  int64  
 10  Month                     96453 non-null  int64  
 11  Year                      96453 non-null  int64  
 12  Hour                      96453 non-null  int64  
dtypes: float64(8), int64(4), object(1)
memory usage: 9.6+ MB


In [36]:
df['Precip Type'].value_counts()

Precip Type
rain    85224
snow    10712
Name: count, dtype: int64