In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pickle
import numpy as np

In [7]:
df = pd.read_csv("small_weather_dataset.csv")
df.head(2)

Unnamed: 0,DATE,MONTH,BASEL_temp_mean,BASEL_humidity,BASEL_pressure,BASEL_precipitation,BASEL_sunshine,RainTomorrow
0,20000101,1,2.9,0.89,1.0286,0.03,0.0,1
1,20000102,1,3.6,0.87,1.0318,0.0,0.0,1


In [5]:
df.tail(2)

Unnamed: 0,DATE,MONTH,BASEL_temp_mean,BASEL_humidity,BASEL_pressure,BASEL_precipitation,BASEL_sunshine,RainTomorrow
198,20000717,7,14.8,0.71,1.0192,0.0,12.0,0
199,20000718,7,16.7,0.66,1.0191,0.0,11.8,0


In [9]:
df.columns

Index(['DATE', 'MONTH', 'BASEL_temp_mean', 'BASEL_humidity', 'BASEL_pressure',
       'BASEL_precipitation', 'BASEL_sunshine', 'RainTomorrow'],
      dtype='object')

In [11]:
df.count()

DATE                   200
MONTH                  200
BASEL_temp_mean        200
BASEL_humidity         200
BASEL_pressure         200
BASEL_precipitation    200
BASEL_sunshine         200
RainTomorrow           200
dtype: int64

In [13]:
df.describe()

Unnamed: 0,DATE,MONTH,BASEL_temp_mean,BASEL_humidity,BASEL_pressure,BASEL_precipitation,BASEL_sunshine,RainTomorrow
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,20000400.0,3.815,10.8795,0.7078,1.019582,0.19625,5.1135,0.3
std,191.0765,1.918116,6.723037,0.09375,0.008688,0.415236,4.677882,0.459408
min,20000100.0,1.0,-6.8,0.48,0.992,0.0,0.0,0.0
25%,20000220.0,2.0,5.6,0.64,1.01415,0.0,0.4,0.0
50%,20000410.0,4.0,10.4,0.71,1.01915,0.0,3.9,0.0
75%,20000530.0,5.0,16.7,0.78,1.026025,0.25,8.8,1.0
max,20000720.0,7.0,25.1,0.91,1.0391,4.11,15.3,1.0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   DATE                 200 non-null    int64  
 1   MONTH                200 non-null    int64  
 2   BASEL_temp_mean      200 non-null    float64
 3   BASEL_humidity       200 non-null    float64
 4   BASEL_pressure       200 non-null    float64
 5   BASEL_precipitation  200 non-null    float64
 6   BASEL_sunshine       200 non-null    float64
 7   RainTomorrow         200 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 12.6 KB


In [17]:
df.shape
print(f"Number of Rows: {df.shape[0]} and Number of Columns: {df.shape[1]}")

Number of Rows: 200 and Number of Columns: 8


In [21]:
df['MONTH'].unique()

array([1, 2, 3, 4, 5, 6, 7], dtype=int64)

In [23]:
df.nunique()

DATE                   200
MONTH                    7
BASEL_temp_mean        138
BASEL_humidity          41
BASEL_pressure         151
BASEL_precipitation     60
BASEL_sunshine          91
RainTomorrow             2
dtype: int64

In [25]:
df.isnull().sum()

DATE                   0
MONTH                  0
BASEL_temp_mean        0
BASEL_humidity         0
BASEL_pressure         0
BASEL_precipitation    0
BASEL_sunshine         0
RainTomorrow           0
dtype: int64

## Data Pre-Processing

In [42]:
df = df.drop('MONTH', axis=1)

KeyError: "['MONTH'] not found in axis"

In [44]:
df = df.drop('BASEL_temp_mean', axis=1)

In [52]:
def fillnaObjectMode(cols):
    for i in cols:
        df[i] = df[i].fillna(df[i].mode()[0])

columns = ['BASEL_pressure', 'BASEL_temp_mean']
fillnaObjectMode(columns)

KeyError: 'BASEL_temp_mean'

In [56]:
def fillnaIntMode(cols):
    for i in cols:
        df[i] = df[i].fillna(df[i].mode()[0])

columns = ['MONTH','BASEL_temp_mean','BASEL_humidity','BASEL_pressure','BASEL_precipitation','BASEL_sunshine','RainTomorrow']
fillnaIntMode(columns)

KeyError: 'MONTH'

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   DATE                 200 non-null    int64  
 1   BASEL_humidity       200 non-null    float64
 2   BASEL_pressure       200 non-null    float64
 3   BASEL_precipitation  200 non-null    float64
 4   BASEL_sunshine       200 non-null    float64
dtypes: float64(4), int64(1)
memory usage: 7.9 KB


## Manage Data Types

In [61]:
df['BASEL_humidity'] = df['BASEL_humidity'].astype('int64')

In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   DATE                 200 non-null    int64  
 1   BASEL_humidity       200 non-null    int64  
 2   BASEL_pressure       200 non-null    float64
 3   BASEL_precipitation  200 non-null    float64
 4   BASEL_sunshine       200 non-null    float64
dtypes: float64(3), int64(2)
memory usage: 7.9 KB


In [67]:
def convertObjToInt(cols):
    for i in cols:
        data = pd.DataFrame({i:df[i].unique()})
        data_LE = LabelEncoder()
        data_LE.fit(np.ravel(data))    
        df[i] = data_LE.transform(df[i])
        
columns = ['BASEL_precipitation', 'BASEL_humidity', 'BASEL_pressure']
convertObjToInt(columns)

NameError: name 'LabelEncoder' is not defined

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   DATE                 200 non-null    int64  
 1   BASEL_humidity       200 non-null    int64  
 2   BASEL_pressure       200 non-null    float64
 3   BASEL_precipitation  200 non-null    float64
 4   BASEL_sunshine       200 non-null    float64
dtypes: float64(3), int64(2)
memory usage: 7.9 KB


In [71]:
df.head(2)

Unnamed: 0,DATE,BASEL_humidity,BASEL_pressure,BASEL_precipitation,BASEL_sunshine
0,20000101,0,1.0286,0.03,0.0
1,20000102,0,1.0318,0.0,0.0


In [73]:
df.to_csv('processed_data.csv', index=False, header=True)

In [75]:
df = pd.read_csv('processed_data.csv')
df

Unnamed: 0,DATE,BASEL_humidity,BASEL_pressure,BASEL_precipitation,BASEL_sunshine
0,20000101,0,1.0286,0.03,0.0
1,20000102,0,1.0318,0.00,0.0
2,20000103,0,1.0314,0.00,3.7
3,20000104,0,1.0262,0.35,6.9
4,20000105,0,1.0246,0.07,3.7
...,...,...,...,...,...
195,20000714,0,1.0116,1.09,0.1
196,20000715,0,1.0124,0.64,2.4
197,20000716,0,1.0166,1.17,2.6
198,20000717,0,1.0192,0.00,12.0
