In [77]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pickle
import numpy as np

In [83]:
df = pd.read_csv("small_weather_dataset.csv")
df.head(3)

Unnamed: 0,DATE,MONTH,BASEL_temp_mean,BASEL_humidity,BASEL_pressure,BASEL_precipitation,BASEL_sunshine,RainTomorrow
0,20000101,1,2.9,0.89,1.0286,0.03,0.0,1
1,20000102,1,3.6,0.87,1.0318,0.0,0.0,1
2,20000103,1,2.2,0.81,1.0314,0.0,3.7,0


In [81]:
df.tail(3)

Unnamed: 0,DATE,MONTH,BASEL_temp_mean,BASEL_humidity,BASEL_pressure,BASEL_precipitation,BASEL_sunshine,RainTomorrow
197,20000716,7,12.7,0.81,1.0166,1.17,2.6,1
198,20000717,7,14.8,0.71,1.0192,0.0,12.0,0
199,20000718,7,16.7,0.66,1.0191,0.0,11.8,0


In [85]:
df.columns

Index(['DATE', 'MONTH', 'BASEL_temp_mean', 'BASEL_humidity', 'BASEL_pressure',
       'BASEL_precipitation', 'BASEL_sunshine', 'RainTomorrow'],
      dtype='object')

In [87]:
df.count()

DATE                   200
MONTH                  200
BASEL_temp_mean        200
BASEL_humidity         200
BASEL_pressure         200
BASEL_precipitation    200
BASEL_sunshine         200
RainTomorrow           200
dtype: int64

In [89]:
df.describe()

Unnamed: 0,DATE,MONTH,BASEL_temp_mean,BASEL_humidity,BASEL_pressure,BASEL_precipitation,BASEL_sunshine,RainTomorrow
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,20000400.0,3.815,10.8795,0.7078,1.019582,0.19625,5.1135,0.3
std,191.0765,1.918116,6.723037,0.09375,0.008688,0.415236,4.677882,0.459408
min,20000100.0,1.0,-6.8,0.48,0.992,0.0,0.0,0.0
25%,20000220.0,2.0,5.6,0.64,1.01415,0.0,0.4,0.0
50%,20000410.0,4.0,10.4,0.71,1.01915,0.0,3.9,0.0
75%,20000530.0,5.0,16.7,0.78,1.026025,0.25,8.8,1.0
max,20000720.0,7.0,25.1,0.91,1.0391,4.11,15.3,1.0


In [91]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   DATE                 200 non-null    int64  
 1   MONTH                200 non-null    int64  
 2   BASEL_temp_mean      200 non-null    float64
 3   BASEL_humidity       200 non-null    float64
 4   BASEL_pressure       200 non-null    float64
 5   BASEL_precipitation  200 non-null    float64
 6   BASEL_sunshine       200 non-null    float64
 7   RainTomorrow         200 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 12.6 KB


In [93]:
df.shape
print(f"Number of Rows: {df.shape[0]} and Number of Columns: {df.shape[1]}")

Number of Rows: 200 and Number of Columns: 8


In [95]:
df['MONTH'].unique()

array([1, 2, 3, 4, 5, 6, 7], dtype=int64)

In [97]:
df.nunique()

DATE                   200
MONTH                    7
BASEL_temp_mean        138
BASEL_humidity          41
BASEL_pressure         151
BASEL_precipitation     60
BASEL_sunshine          91
RainTomorrow             2
dtype: int64

In [99]:
df.isnull().sum()

DATE                   0
MONTH                  0
BASEL_temp_mean        0
BASEL_humidity         0
BASEL_pressure         0
BASEL_precipitation    0
BASEL_sunshine         0
RainTomorrow           0
dtype: int64

## Data Pre-Processing

In [None]:
df = df.drop('MONTH', axis=1)

In [None]:
df = df.drop('BASEL_temp_mean', axis=1)

In [None]:
def fillnaObjectMode(cols):
    for i in cols:
        df[i] = df[i].fillna(df[i].mode()[0])

columns = ['BASEL_pressure', 'BASEL_temp_mean']
fillnaObjectMode(columns)

In [None]:
def fillnaIntMode(cols):
    for i in cols:
        df[i] = df[i].fillna(df[i].mode()[0])

columns = ['MONTH','BASEL_temp_mean','BASEL_humidity','BASEL_pressure','BASEL_precipitation','BASEL_sunshine','RainTomorrow']
fillnaIntMode(columns)

In [102]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   DATE                 200 non-null    int64  
 1   MONTH                200 non-null    int64  
 2   BASEL_temp_mean      200 non-null    float64
 3   BASEL_humidity       200 non-null    float64
 4   BASEL_pressure       200 non-null    float64
 5   BASEL_precipitation  200 non-null    float64
 6   BASEL_sunshine       200 non-null    float64
 7   RainTomorrow         200 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 12.6 KB


## Manage Data Types

In [105]:
df['BASEL_humidity'] = df['BASEL_humidity'].astype('int64')

In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   DATE                 200 non-null    int64  
 1   MONTH                200 non-null    int64  
 2   BASEL_temp_mean      200 non-null    float64
 3   BASEL_humidity       200 non-null    int64  
 4   BASEL_pressure       200 non-null    float64
 5   BASEL_precipitation  200 non-null    float64
 6   BASEL_sunshine       200 non-null    float64
 7   RainTomorrow         200 non-null    int64  
dtypes: float64(4), int64(4)
memory usage: 12.6 KB


In [None]:
def convertObjToInt(cols):
    for i in cols:
        data = pd.DataFrame({i:df[i].unique()})
        data_LE = LabelEncoder()
        data_LE.fit(np.ravel(data))    
        df[i] = data_LE.transform(df[i])
        
columns = ['BASEL_precipitation', 'BASEL_humidity', 'BASEL_pressure']
convertObjToInt(columns)

In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   DATE                 200 non-null    int64  
 1   MONTH                200 non-null    int64  
 2   BASEL_temp_mean      200 non-null    float64
 3   BASEL_humidity       200 non-null    int64  
 4   BASEL_pressure       200 non-null    float64
 5   BASEL_precipitation  200 non-null    float64
 6   BASEL_sunshine       200 non-null    float64
 7   RainTomorrow         200 non-null    int64  
dtypes: float64(4), int64(4)
memory usage: 12.6 KB


In [111]:
df.head(3)

Unnamed: 0,DATE,MONTH,BASEL_temp_mean,BASEL_humidity,BASEL_pressure,BASEL_precipitation,BASEL_sunshine,RainTomorrow
0,20000101,1,2.9,0,1.0286,0.03,0.0,1
1,20000102,1,3.6,0,1.0318,0.0,0.0,1
2,20000103,1,2.2,0,1.0314,0.0,3.7,0


In [113]:
df.to_csv('processed_data.csv', index=False, header=True)

In [115]:
df = pd.read_csv('processed_data.csv')
df

Unnamed: 0,DATE,MONTH,BASEL_temp_mean,BASEL_humidity,BASEL_pressure,BASEL_precipitation,BASEL_sunshine,RainTomorrow
0,20000101,1,2.9,0,1.0286,0.03,0.0,1
1,20000102,1,3.6,0,1.0318,0.00,0.0,1
2,20000103,1,2.2,0,1.0314,0.00,3.7,0
3,20000104,1,3.9,0,1.0262,0.35,6.9,1
4,20000105,1,6.0,0,1.0246,0.07,3.7,1
...,...,...,...,...,...,...,...,...
195,20000714,7,13.7,0,1.0116,1.09,0.1,1
196,20000715,7,12.6,0,1.0124,0.64,2.4,1
197,20000716,7,12.7,0,1.0166,1.17,2.6,1
198,20000717,7,14.8,0,1.0192,0.00,12.0,0


## Train - Test Splitting

In [118]:
import pandas as pd
df = pd.read_csv('processed_data.csv')
df.columns

Index(['DATE', 'MONTH', 'BASEL_temp_mean', 'BASEL_humidity', 'BASEL_pressure',
       'BASEL_precipitation', 'BASEL_sunshine', 'RainTomorrow'],
      dtype='object')

In [120]:
X = df.drop('BASEL_temp_mean', axis=1)  
y = df['BASEL_temp_mean']

In [124]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X,y, test_size=0.3, shuffle=False) 

In [126]:
print(train_X.shape, train_y.shape)
print(test_X.shape, test_y.shape)

(140, 7) (140,)
(60, 7) (60,)


## Applying Classifier / Model Training

In [131]:
from sklearn.svm import SVC 

model_svc = SVC()
model_svc.fit(train_X, train_y)

print(model_svc)

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [133]:
model_pred_svc = model_svc.predict(test_X)
model_pred_svc

AttributeError: 'SVC' object has no attribute 'support_vectors_'

In [136]:
from sklearn.metrics import accuracy_score
model_acc_svc = accuracy_score(test_y, model_pred_svc)
print(round(model_acc_svc,3)*100, "%")

NameError: name 'model_pred_svc' is not defined

In [138]:
import pickle
pickle.dump(model_svc, open('model_svc.pkl', 'wb'))

In [140]:
import pickle
model_svc = pickle.load(open('model_svc.pkl', 'rb'))