In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pickle

### Data Collection

In [3]:
import pandas as pd

class DataCollector:
    def __init__(self, file_path):
        self.file_path = file_path
        self.data = None


    def load_data(self):
        # Load the dataset
        self.data = pd.read_csv(self.file_path)
        print(F"Data loaded from {self.file_path}") 
        return self.data
    
    def get_data_summary(self):
        if self.data is None:
            print("Data not loaded. Please run load_data()first.")
            return
        print("Data Info:")
        print(self.data.info())
        print("\nFirst 5 rows of the data:")
        print(self.data.head())

Collector = DataCollector("C:\\Users\\dell\\Downloads\\weather_prediction_dataset.csv")
Collector.load_data()
Collector.get_data_summary()       

Data loaded from C:\Users\dell\Downloads\weather_prediction_dataset.csv
Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   DATE                    99 non-null     int64  
 1   MONTH                   99 non-null     int64  
 2   BASEL_cloud_cover       99 non-null     int64  
 3   BASEL_humidity          99 non-null     float64
 4   BASEL_pressure          99 non-null     float64
 5   BASEL_global_radiation  99 non-null     float64
 6   BASEL_precipitation     99 non-null     float64
 7   BASEL_sunshine          99 non-null     float64
 8   BASEL_temp_mean         99 non-null     float64
 9   BASEL_temp_min          99 non-null     float64
 10  BASEL_temp_max          99 non-null     float64
dtypes: float64(8), int64(3)
memory usage: 8.6 KB
None

First 5 rows of the data:
       DATE  MONTH  BASEL_cloud_cover  BASE


### Data Understanding

In [7]:
class DataUnderstanding:
    def __init__(self, data):
        self.data = data
        
    def overview(self):
        print("Shape of data:", self.data.shape) 
        print("\nData types:\n", self.data.dtypes)
        print("\nStatical summary:\n", self.data.isnull().sum())
        print("\nMsssing values:\n", self.data.isnull().sum())
        print("\nStatical summary:\n", self.data.describe(include='all'))

    def unique_values(self, column):
        unique_values = self.data[column].unique()
        print(f"Unique values in '{column}':\n{unique_values}")

understanding = DataUnderstanding(Collector.data)
understanding.overview()
print("Available columns:", Collector.data.columns.tolist())
if "MONTH" in Collector.data.columns:
    understanding.unique_values("MONTH")
else:
    print("Column 'MONTH' not found in data.")
if "DAY" in Collector.data.columns:
    understanding.unique_values("DAY")
else:
    print("Column 'DAY' not found in data.")



Shape of data: (99, 11)

Data types:
 DATE                        int64
MONTH                       int64
BASEL_cloud_cover           int64
BASEL_humidity            float64
BASEL_pressure            float64
BASEL_global_radiation    float64
BASEL_precipitation       float64
BASEL_sunshine            float64
BASEL_temp_mean           float64
BASEL_temp_min            float64
BASEL_temp_max            float64
dtype: object

Statical summary:
 DATE                      0
MONTH                     0
BASEL_cloud_cover         0
BASEL_humidity            0
BASEL_pressure            0
BASEL_global_radiation    0
BASEL_precipitation       0
BASEL_sunshine            0
BASEL_temp_mean           0
BASEL_temp_min            0
BASEL_temp_max            0
dtype: int64

Msssing values:
 DATE                      0
MONTH                     0
BASEL_cloud_cover         0
BASEL_humidity            0
BASEL_pressure            0
BASEL_global_radiation    0
BASEL_precipitation       0
BASEL_sunshine     

### Data Preprocessing

In [9]:
import pandas as pd

class DataPreprocessor:
    def __init__(self, data):
        self.data = data

    def remove_duplicates(self, columns):
        # Remove duplicate rows
        self.data = self.data.drop_duplicates(subset=columns)

    def fill_missing_values_mean(self, columns):
        # Fill missing values with the mean
        for col in columns:
            mode_value = self.data[col].mode()[0]
            self.data[col].fillna(mode_value, inplace=True)

    def encode_categorical(self, columns):
        # Encode categorical variables
        self.data = pd.get_dummies(self.data, columns=columns)

    def normalize_data(self, columns):
        # Normalize numerical columns
        for col in columns:
            min_value = self.data[col].min()
            max_value = self.data[col].max()
            self.data[col] = (self.data[col] - min_value) / (max_value - min_value)

preprocessor = DataPreprocessor(Collector.data)
preprocessor.remove_duplicates(["DATE"])
preprocessor.fill_missing_values_mean(["DATE"])

# Only encode categorical columns that exist in the data
categorical_columns = [col for col in ["MONTH", "DAY"] if col in preprocessor.data.columns]
if categorical_columns:
    preprocessor.encode_categorical(categorical_columns)

preprocessor.normalize_data([col for col in ["DATE", "TEMP", "HUMIDITY", "WIND_SPEED"] if col in preprocessor.data.columns])

processd_data = preprocessor.data
print(processd_data.head())
                        

       DATE  BASEL_cloud_cover  BASEL_humidity  BASEL_pressure  \
0  0.000000                  8            0.89          1.0286   
1  0.003257                  8            0.87          1.0318   
2  0.006515                  5            0.81          1.0314   
3  0.009772                  7            0.79          1.0262   
4  0.013029                  5            0.90          1.0246   

   BASEL_global_radiation  BASEL_precipitation  BASEL_sunshine  \
0                    0.20                 0.03             0.0   
1                    0.25                 0.00             0.0   
2                    0.50                 0.00             3.7   
3                    0.63                 0.35             6.9   
4                    0.51                 0.07             3.7   

   BASEL_temp_mean  BASEL_temp_min  BASEL_temp_max  MONTH_1  MONTH_2  MONTH_3  \
0              2.9             1.6             3.9     True    False    False   
1              3.6             2.7          

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.data[col].fillna(mode_value, inplace=True)


### Data Spilting

In [16]:
from sklearn.model_selection import train_test_split

class DataSplitter:
    def __init__(self, data, target_column):
        self.data = data
        self.target_column = target_column

    def split(self, test_size=0.2, random_state=42):
        x = self.data.drop(columns=[self.target_column])
        y = self.data[self.target_column]
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=random_state)

        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test

        return x_train, x_test, y_train, y_test

splitter = DataSplitter(processd_data, "BASEL_temp_mean")
x_train, x_test, y_train, y_test = splitter.split()
print("x_train, y_train, x_test, y_test")

x_train, y_train, x_test, y_test


### Data training

In [19]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

class ModelTrainer:
    def __init__(self, x_train, y_train):
        # Drop only columns that exist
        cols_to_drop = [col for col in ["date", "street", "city", "statezip", "country"] if col in x_train.columns]
        self.x_train = x_train.drop(columns=cols_to_drop)
        self.y_train = y_train
        self.model = None

    def train_svr(self, kernel='linear', C=1.0):
        self.model = SVR(kernel=kernel, C=C)
        self.model.fit(self.x_train, self.y_train)
        print("SVR model trained successfully.")

    def evaluate_model(self, x_test, y_test):
        # Drop only columns that exist
        cols_to_drop = [col for col in ["date", "street", "city", "statezip", "country"] if col in x_test.columns]
        x_test = x_test.drop(columns=cols_to_drop)
        predictions = self.model.predict(x_test)
        mse = mean_squared_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        print(f"Mean Squared Error: {mse}")
        print(f"R^2 Score: {r2}")

trainer = ModelTrainer(x_train, y_train)
trainer.train_svr(kernel='linear', C=1.0)
trainer.evaluate_model(x_test, y_test)

SVR model trained successfully.
Mean Squared Error: 0.11496485916472785
R^2 Score: 0.9912606650261042


### Model storage using pickle

In [20]:
import pickle

class ModelSaver:
    def __init__(self, model):
        self.model = model

    def save_model(self, filename):
        with open(filename, 'wb') as file:
            pickle.dump(self.model, file)
        print(f"Model saved to {filename}")

    def load_model(self, filename):
        with open(filename, 'rb') as file:
            self.model = pickle.load(file)
        print(f"Model loaded from {filename}")
        return self.model
    
saver = ModelSaver(trainer.model)
saver.save_model("svm_Book1_model.pkl")
loaded_model = saver.load_model("svm_Book1_model.pkl")

Model saved to svm_Book1_model.pkl
Model loaded from svm_Book1_model.pkl
