In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
class IQROutlierRemoval(BaseEstimator, TransformerMixin):
    """Drop rows whose values fall outside per-column IQR fences.

    ``fit`` learns, for every ``int64``/``float64`` column of ``X``, the
    fences ``Q1 - threshold * IQR`` and ``Q3 + threshold * IQR``.
    ``transform`` keeps only the rows that lie within the fences of *every*
    fitted column.

    Parameters
    ----------
    threshold : float, default=1.5
        Multiplier applied to the inter-quartile range when computing the
        fences.

    Attributes
    ----------
    bounds_ : dict
        Maps column name to ``(lower_bound, upper_bound)``; set by ``fit``.

    Notes
    -----
    ``transform`` changes the number of rows and, when ``y`` is supplied,
    returns an ``(X, y)`` tuple instead of a single frame.
    """

    def __init__(self, threshold=1.5):
        self.threshold = threshold

    def fit(self, X, y=None):
        """Learn the lower/upper fence of each int64/float64 column of ``X``.

        ``y`` is accepted for API symmetry and ignored. Returns ``self``.
        """
        self.bounds_ = {}
        for col in X.select_dtypes(include=['int64', 'float64']):
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - self.threshold * IQR
            upper_bound = Q3 + self.threshold * IQR
            self.bounds_[col] = (lower_bound, upper_bound)
        return self

    def transform(self, X, y=None):
        """Return the rows of ``X`` (and ``y``) that are inside every fence.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column seen during ``fit``.
        y : optional
            Row-aligned target; indexed with the same boolean mask as ``X``.

        Returns
        -------
        pandas.DataFrame, or tuple ``(X_out, y_out)`` when ``y`` is given.
        """
        # Build the mask against the *unfiltered* input so it always shares
        # X's index; filtering inside the loop would leave the mask and the
        # frame misaligned and rely on pandas' implicit reindexing.
        mask = pd.Series(True, index=X.index)
        for col, (lower_bound, upper_bound) in self.bounds_.items():
            # Inclusive fences: values sitting exactly on a fence are not
            # outliers, and a zero-IQR (constant) column must not wipe out
            # every row. NaN compares False, so rows with NaN are dropped.
            mask &= X[col].between(lower_bound, upper_bound)
        X_out = X[mask]  # boolean indexing returns a new frame
        if y is not None:
            y_out = y[mask]
            return X_out, y_out
        return X_out

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return ``transform(X, y)``, forwarding ``y``."""
        return self.fit(X, y).transform(X, y)



def read_csv(path):
    """Load the CSV at ``path`` and return it as a pandas DataFrame.

    ``path`` is handed to ``pandas.read_csv`` unchanged, with all parsing
    options left at their defaults.
    """
    return pd.read_csv(path)

def show_info(df):
    """Print the summary produced by ``df.info()``; returns ``None``."""
    df.info()
    return None

def show_distribution(df):
    """Plot a correlation heatmap and a pair plot of the standardized data.

    Only numeric columns are correlated and standardized, so a frame that
    also holds text/categorical columns no longer makes ``corr()`` or
    ``StandardScaler`` raise. Non-numeric columns are carried through to the
    pair plot's input unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to visualize; it is not modified.

    Raises
    ------
    ValueError
        If ``df`` has no numeric column to plot.
    """
    numeric_columns = df.select_dtypes(include='number').columns
    if numeric_columns.empty:
        raise ValueError('show_distribution needs at least one numeric column')

    correlation_matrix = df[numeric_columns].corr()
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title('Correlation Matrix')
    plt.show()

    # Standardize a copy so every variable shares one scale in the pair plot.
    scaled_df = df.copy()
    scaled_df[numeric_columns] = StandardScaler().fit_transform(df[numeric_columns])
    sns.pairplot(scaled_df)
    # Show the pair plot explicitly, as is done for the heatmap, so it also
    # appears when this runs outside a notebook.
    plt.show()

def remove_outlier(df):
    """Return ``df`` after a default ``IQROutlierRemoval`` fit_transform.

    The remover is fitted on ``df`` itself and then applied to it.
    """
    return IQROutlierRemoval().fit_transform(df)
    
# Train a linear-regression baseline on the concrete dataset and report R^2
# on a held-out 30% split.
path = 'concrete_data.csv'
df = read_csv(path)

# Separate the predictors from the regression target.
X = df.drop(columns=['Strength'])
y = df['Strength']

# Numeric predictors: mean-impute missing values, then standardize.
# NOTE: IQROutlierRemoval is not used as a step here; it drops rows, which
# would leave X and y with different lengths inside a Pipeline.
numerical_columns = X.select_dtypes(['int64', 'float64']).columns
numerical_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# No categorical predictors are selected; the branch is kept so that adding
# names to `categorical_columns` is the only change needed to enable it.
categorical_columns = []
categorical_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse_output=False)),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numerical_preprocessor, numerical_columns),
        ('categorical', categorical_preprocessor, categorical_columns),
    ],
    remainder='passthrough',
)
print(f'Preprocessor : {preprocessor}')

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])

# Fixed random_state keeps the split, and therefore the score, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=120, test_size=0.3)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
print(r2_score(y_test, y_pred))

Preprocessor : ColumnTransformer(remainder='passthrough',
                  transformers=[('numeric',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 Index(['Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water', 'Superplasticizer',
       'Coarse Aggregate', 'Fine Aggregate', 'Age'],
      dtype='object')),
                                ('categorical',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('ohe',
                                                  OneHotEncoder(sparse_output=False))]),
                                 [])])
0.5165314542099333
