In [6]:
import pandas as pd

In [7]:
df = pd.read_csv('artifact\\data.csv')
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [8]:
## Feature Selection

X = df.drop('math_score',axis=1)
y = df['math_score']

In [9]:
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


### Dividing Features based on their Data Types for Data Transformation


In [10]:
cat_features = [feature for feature in df.columns if df[feature].dtype=='O']
cat_features

['gender',
 'race_ethnicity',
 'parental_level_of_education',
 'lunch',
 'test_preparation_course']

In [11]:
num_features = [feature for feature in df.columns if feature not in cat_features]
num_features.remove('math_score')

In [12]:
num_features

['reading_score', 'writing_score']

### data tranformation


In [13]:

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


In [14]:
transformer = ColumnTransformer(

[('Standardization',StandardScaler(),num_features),
 ('OneHotEncoding',OneHotEncoder(),cat_features)]

    )

In [15]:
transformed_data = transformer.fit_transform(X)

In [16]:
pd.DataFrame(transformed_data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.193999,0.391492,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1.427476,1.313269,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,1.770109,1.642475,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3,-0.833899,-1.583744,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.605158,0.457333,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2.044215,1.774157,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
996,-0.970952,-0.859491,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
997,0.125472,-0.201079,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
998,0.605158,0.589015,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


In [17]:
transformed_data.shape

(1000, 19)

In [18]:
### Train test split

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [19]:
pd.DataFrame(X_train).head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
29,female,group D,master's degree,standard,none,70,75
535,female,group C,bachelor's degree,free/reduced,completed,83,83
695,female,group D,some college,free/reduced,none,89,86
557,male,group C,master's degree,free/reduced,none,67,66
836,male,group E,high school,standard,none,64,57


In [20]:
pd.DataFrame(y_train).head()


Unnamed: 0,math_score
29,62
535,66
695,79
557,61
836,73


In [21]:
pd.DataFrame(X_test).head()


Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
521,female,group C,associate's degree,standard,none,86,84
737,female,group B,some college,free/reduced,completed,66,73
740,male,group D,bachelor's degree,standard,none,73,72
660,male,group C,some college,free/reduced,none,77,73
411,male,group E,some college,standard,completed,83,78


In [22]:
pd.DataFrame(y_test).head()


Unnamed: 0,math_score
521,91
737,53
740,80
660,74
411,84


In [23]:
import xgboost

In [24]:
import sys
print(sys.executable)

e:\Machine Learning Projects\mlproject\venv\python.exe


In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [26]:
models = {
                        "LinearRegression": LinearRegression(),
                        "SVR": SVR(),
                        "KNeighborsRegressor": KNeighborsRegressor(),
                        "DecisionTreeRegressor": DecisionTreeRegressor(),
                        "RandomForestRegressor": RandomForestRegressor(),
                        "AdaBoostRegressor": AdaBoostRegressor(),
                        "GradientBoostingRegressor": GradientBoostingRegressor(),
                        "XGBRegressor": XGBRegressor(),
                        "CatBoostRegressor": CatBoostRegressor(verbose=0)  # Suppress verbose output
                    }

In [27]:
def evaluate_models(X_train,y_train,X_test,y_test,models):
        
    try:
        report = {}

        for model_name, model in models.items():
            model.fit(X_train,y_train)

            y_pred_train = model.predict(X_train)
            y_pred_test = model.predict(X_test)

            train_model_score = r2_score(y_train,y_pred_train)
            test_model_score = r2_score(y_test,y_pred_test)

            report[model_name]=test_model_score
    except Exception as e:
        print(e)

In [28]:
evaluate_models(X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test,models=models)

could not convert string to float: 'female'


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

class DataTransformation:
    def __init__(self):
        def get_data_transformer_object(self):
            try:
                numerical_columns = ['reading_score', 'writing_score']
                categorical_columns = [
                    'gender','race_ethnicity',
                    'parental_level_of_education','lunch',
                    'test_preparation_course'
                ]

                num_pipeline = Pipeline(

                    steps=[
                        ('imputer',SimpleImputer(strategy='median')),
                        ('scaler',StandardScaler())
                        ]
                )

                cat_pipeline = Pipeline(

                    steps=[
                        ('imputer',SimpleImputer(strategy='most_frequent')),
                        ('one_hot_encoder',OneHotEncoder(sparse=False)),
                        ('scaler',StandardScaler(with_mean=False))
                        ]
                )

                preprocessor = ColumnTransformer(

                    [
                        ('numerical_pipeline',num_pipeline,numerical_columns),
                        ('categorical_pipeline',cat_pipeline,categorical_columns)
                    ]
                )

                return preprocessor
            except Exception as e:
                print(e)
            

In [None]:

transform = get_data_transformer_object()
transform.fit_transform(X_train)
transform.transform(X_test)

TypeError: get_data_transformer_object() missing 1 required positional argument: 'self'