In [None]:

import warnings

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import f1_score as f1

In [None]:
!pip install category_encoders
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.target_encoder import TargetEncoder


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Load the comma-separated dataset and preview its first rows.
df = pd.read_csv('/content/df2.csv', sep=',')
df.head()

Unnamed: 0,Make,Model,Style,Distance,Engine_capacity(cm3),Fuel_type,Transmission,Price(euro),Age,km_year,class,prestige
0,Toyota,Prius,Hatchback,195000.0,1800.0,Hybrid,Automatic,7750.0,11,17727.272727,4,2
1,Renault,Grand Scenic,Universal,135000.0,1500.0,Diesel,Manual,8550.0,8,16875.0,4,2
2,Renault,Laguna,Universal,110000.0,1500.0,Diesel,Manual,6550.0,10,11000.0,3,3
3,Opel,Astra,Universal,200000.0,1600.0,Metan/Propan,Manual,4100.0,16,12500.0,3,3
4,Mercedes,Vito,Microvan,300000.0,2200.0,Diesel,Manual,3490.0,22,13636.363636,3,4


In [None]:
# Remove the "class" column; errors="ignore" keeps the cell re-runnable after
# the column has already been dropped.
df = df.drop(columns="class", errors="ignore")


In [None]:
# Distinct values of the target column.
df["Transmission"].unique()

array(['Automatic', 'Manual'], dtype=object)

In [None]:
# Target is the transmission type; every other column is a feature.
y = df["Transmission"]
X = df.drop(columns='Transmission')
# Relative class frequencies of the target.
y.value_counts(normalize=True)

Manual       0.54234
Automatic    0.45766
Name: Transmission, dtype: float64

In [None]:
# Group the feature columns by dtype: numeric vs. object columns.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.columns[X.dtypes.eq('object')]

print(f"We have {len(num_cols)} numeric columns: {', '.join(num_cols)}")
print(f"And {len(cat_cols)} categorical columns: {', '.join(cat_cols)}")

We have 6 numeric columns: Distance, Engine_capacity(cm3), Price(euro), Age, km_year, prestige
And 4 categorical columns: Make, Model, Style, Fuel_type


In [None]:
# Re-derive both lists from X on every run so the cell is idempotent
# (previously cat_cols was rebound from an Index to a list, which broke a
# second execution), and keep the original column order.
object_cols = X.columns[X.dtypes == 'object']
n_levels = X[object_cols].nunique()
# Object columns with exactly two distinct values are treated as binary.
binary_cols = n_levels[n_levels == 2].index.tolist()
cat_cols = [col for col in object_cols if col not in binary_cols]

In [None]:
# Show the categorical column list followed by the binary column list.
display(cat_cols, binary_cols)

['Make', 'Model', 'Style', 'Fuel_type']

[]

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin, clone


class CustomScaler(TransformerMixin, BaseEstimator):
    """Scale a chosen subset of DataFrame columns, leaving the others untouched.

    There is no convenient ready-made transformer for this, so it is written
    here. Inheriting from ``BaseEstimator`` provides ``get_params`` /
    ``set_params`` and a readable repr, so the transformer can be cloned and
    tuned like any other pipeline step.

    Parameters
    ----------
    cols : list-like of column labels
        Columns of the input DataFrame that are scaled.
    scaler : transformer or None, default None
        Object exposing ``fit`` and ``transform``. A ``StandardScaler`` is used
        when this is ``None`` (or otherwise falsy).

    Attributes
    ----------
    scaler_ : transformer
        The scaler that was actually fitted on ``X[cols]``; set by ``fit``.
    """

    def __init__(self, cols, scaler=None):
        # Constructor arguments are stored as given: get_params / clone read
        # them back by name and expect them unmodified.
        self.cols = cols
        self.scaler = scaler

    def fit(self, X, y=None):
        """Fit the scaler on ``X[cols]`` and return ``self``; ``y`` is ignored."""
        # Fit a fresh copy so repeated fits start clean and a scaler instance
        # supplied by the caller is never mutated.
        self.scaler_ = clone(self.scaler, safe=False) if self.scaler else StandardScaler()
        self.scaler_.fit(X[self.cols])
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the selected columns replaced by scaled values."""
        X_res = X.copy()
        scaled = self.scaler_.transform(X_res[self.cols])
        # Write the scaled values back column by column, in the order of cols.
        for i, col in enumerate(self.cols):
            X_res[col] = scaled[:, i]
        return X_res

In [None]:
# Try the custom scaler on the numeric columns alone and preview the result.
sc = CustomScaler(cols=num_cols)
X2 = sc.fit(X[num_cols]).transform(X[num_cols])
X2.head()

Unnamed: 0,Distance,Engine_capacity(cm3),Price(euro),Age,km_year,prestige
0,0.162384,-0.112381,-0.18142,-0.371928,0.392292,-0.450773
1,-0.500306,-0.604333,-0.097487,-0.790766,0.274765,-0.450773
2,-0.776426,-0.604333,-0.30732,-0.511541,-0.535391,0.440272
3,0.217608,-0.440349,-0.564365,0.326136,-0.328543,0.440272
4,1.322091,0.543555,-0.628364,1.163813,-0.171839,1.331318


числовые столбцы

In [None]:
from sklearn.compose import ColumnTransformer

# Baseline: logistic regression on the raw numeric columns only;
# every non-numeric column is dropped by the selector.
p1 = Pipeline(steps=[
    ("selector", ColumnTransformer(
        transformers=[("selector", "passthrough", num_cols)],
        remainder="drop",
    )),
    ("model_", LogisticRegression()),
])
p1

Pipeline(steps=[('selector',
                 ColumnTransformer(transformers=[('selector', 'passthrough',
                                                  Index(['Distance', 'Engine_capacity(cm3)', 'Price(euro)', 'Age', 'km_year',
       'prestige'],
      dtype='object'))])),
                ('model_', LogisticRegression())])

In [None]:
# A fixed seed makes the split, and every accuracy computed from it, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:


# Fit the raw-numeric baseline and report its hold-out accuracy.
y_pred = p1.fit(X_train, y_train).predict(X_test)
print(acc(y_test, y_pred))

0.8039655817433595




нормализованные числовые столбцы

In [None]:
# Same numeric-only selection as the baseline, standardized before the model.
p2 = Pipeline(steps=[
    ("selector", ColumnTransformer(
        transformers=[("selector", "passthrough", num_cols)],
        remainder="drop",
    )),
    ("scaler_", StandardScaler()),
    ("model_", LogisticRegression()),
])


In [None]:
# Fit the standardized numeric-only model and report its hold-out accuracy.
y_pred = p2.fit(X_train, y_train).predict(X_test)
print(acc(y_test, y_pred))

0.8141912956727771


 pipelines

In [None]:
def _one_hot_pipeline(model, scale):
    """Build: one-hot encoder -> optional StandardScaler -> ``model``."""
    steps = [('one_hot_encoder_', OneHotEncoder(cols=binary_cols+cat_cols))]
    if scale:
        # The scaler is applied to the whole output of the encoder.
        steps.append(('scaler_', StandardScaler()))
    steps.append(('model_', model))
    return Pipeline(steps)


# Numeric and categorical columns, no scaling.
p3 = _one_hot_pipeline(LogisticRegression(), scale=False)
# Normalized numeric and categorical columns.
p4 = _one_hot_pipeline(LogisticRegression(), scale=True)
# Normalized, with L1 and L2 regularization at C=0.1 (saga solver).
p5 = _one_hot_pipeline(LogisticRegression(penalty='l1', C=1e-1, solver='saga'), scale=True)
p6 = _one_hot_pipeline(LogisticRegression(penalty='l2', C=1e-1, solver='saga'), scale=True)

In [None]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validated accuracy for pipelines p1..p6.
for i, pipe in enumerate([p1, p2, p3, p4, p5, p6], start=1):
    cv_res = cross_validate(estimator=pipe, X=X, y=y, scoring='accuracy', cv=5)
    mean_accuracy = cv_res['test_score'].mean()
    print(f"Pipeline {i}: mean cv accuracy = {mean_accuracy}")

Pipeline 1: mean cv accuracy = 0.7736467698655101
Pipeline 2: mean cv accuracy = 0.8089728310124411
Pipeline 3: mean cv accuracy = 0.7740832468725249
Pipeline 4: mean cv accuracy = 0.8527154332360937
Pipeline 5: mean cv accuracy = 0.8522789222037433
Pipeline 6: mean cv accuracy = 0.8527778016765254


In [None]:
def _custom_scaled_pipeline(model):
    """Build: one-hot encoder -> CustomScaler on the numeric columns -> ``model``."""
    return Pipeline([
        ('one_hot_encoder_', OneHotEncoder(cols=binary_cols+cat_cols)),
        ('scaler_', CustomScaler(list(num_cols))),
        ('model_', model),
    ])


# Normalized numeric columns and (unscaled) one-hot categorical columns.
p7 = _custom_scaled_pipeline(LogisticRegression())
# Same features with L1 and L2 regularization at C=0.1 (saga solver).
p8 = _custom_scaled_pipeline(LogisticRegression(penalty='l1', C=1e-1, solver='saga'))
p9 = _custom_scaled_pipeline(LogisticRegression(penalty='l2', C=1e-1, solver='saga'))



In [None]:
# Fit p7 on the training split and report its hold-out accuracy.
y_pred = p7.fit(X_train, y_train).predict(X_test)
print(acc(y_test, y_pred))

0.8572141164733758


In [None]:
# 5-fold cross-validated accuracy for p7..p9. Numbering starts at 7 so the
# printed labels match the pipeline variables instead of reading "Pipeline 1..3".
for i, pipe in enumerate([p7, p8, p9], start=7):
    cv_res = cross_validate(pipe,
                            X,
                            y,
                            cv=5,
                            scoring='accuracy'
                           )
    print(f"Pipeline {i}: mean cv accuracy = {cv_res['test_score'].mean()}")

Pipeline 1: mean cv accuracy = 0.8520295067711634
Pipeline 2: mean cv accuracy = 0.8480386654193952
Pipeline 3: mean cv accuracy = 0.8500964253711374


In [None]:
# Display the p9 pipeline definition.
p9

Pipeline(steps=[('one_hot_encoder_',
                 OneHotEncoder(cols=['Make', 'Model', 'Style', 'Fuel_type'])),
                ('scaler_', <__main__.CustomScaler object at 0x7f2cd18ee220>),
                ('model_', LogisticRegression(C=0.1, solver='saga'))])

In [None]:
# Normalized numeric columns and one-hot categorical columns, expanded with
# degree-2 polynomial features (applied to the full encoded + scaled frame).
p10 = Pipeline(steps=[
    ('one_hot_encoder_', OneHotEncoder(cols=binary_cols + cat_cols)),
    ('scaler_', CustomScaler(list(num_cols))),
    ('poly_features', PolynomialFeatures(degree=2)),
    ('model_', LogisticRegression()),
])


In [None]:

# Fit the degree-2 polynomial pipeline and report its hold-out accuracy.
y_pred = p10.fit(X_train, y_train).predict(X_test)
print(acc(y_test, y_pred))

0.8655692729766804


In [None]:
# Display the p10 pipeline definition.
p10

Pipeline(steps=[('one_hot_encoder_',
                 OneHotEncoder(cols=['Make', 'Model', 'Style', 'Fuel_type'])),
                ('scaler_', <__main__.CustomScaler object at 0x7f2cd4187c70>),
                ('poly_features', PolynomialFeatures()),
                ('model_', LogisticRegression())])

In [None]:
# Normalized numeric columns expanded to polynomial degree 1..5, combined with
# one-hot encoded categorical columns.
degrees = [1, 2, 3, 4, 5]
for deg in degrees:
    # Scale, then polynomially expand, the numeric columns only.
    numerical_transformer = Pipeline([
        ('scaler', CustomScaler(list(num_cols))),
        ('polynom', PolynomialFeatures(deg)),
    ])
    # The numeric transformer sees only num_cols; the categorical columns are
    # one-hot encoded separately.
    ct = ColumnTransformer([
        ('num_transformer', numerical_transformer, num_cols),
        ('encoder', OneHotEncoder(), cat_cols),
    ])
    # Main pipeline; the final step is a logistic regression and is named so.
    pipeline = Pipeline([
        ('column_transformer', ct),
        ('logistic_regression', LogisticRegression()),
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # Label each result by its polynomial degree: the former "Pipeline {deg + 1}"
    # label collided with the numbering of p2..p6 and hid the degree.
    print(f"degree {deg}: accuracy = {acc(y_test, y_pred)}")


Pipeline 2: accuracy = 0.8573388203017832
Pipeline 3: accuracy = 0.8617034542960469
Pipeline 4: accuracy = 0.858211747100636
Pipeline 5: accuracy = 0.8441202144905848
Pipeline 6: accuracy = 0.7972315750093528


Best result: Pipeline 3 (polynomial degree 2): accuracy = 0.8617034542960469

In [None]:
# Display the pipeline left over from the last loop iteration.
pipeline

Pipeline(steps=[('column_transformer',
                 ColumnTransformer(transformers=[('num_transformer',
                                                  Pipeline(steps=[('scaler',
                                                                   <__main__.CustomScaler object at 0x7f2cd18eecd0>),
                                                                  ('polynom',
                                                                   PolynomialFeatures(degree=5))]),
                                                  Index(['Distance', 'Engine_capacity(cm3)', 'Price(euro)', 'Age', 'km_year',
       'prestige'],
      dtype='object')),
                                                 ('encoder', OneHotEncoder(),
                                                  ['Make', 'Model', 'Style',
                                                   'Fuel_type'])])),
                ('linear_regression', LogisticRegression())])

In [None]:
# Rebuild the features and the (still string-valued) target from df.
X, y = df.drop(columns='Transmission'), df["Transmission"]
y.value_counts(normalize=True)

Manual       0.54234
Automatic    0.45766
Name: Transmission, dtype: float64

In [None]:
# Encode the target as 1 = Automatic, 0 = Manual. Mapping from the raw df
# column (not from y itself) keeps the cell re-runnable: re-mapping an
# already-encoded y would yield NaN and make astype(int) fail.
y = df["Transmission"].map({"Automatic": 1, "Manual": 0}).astype(int)
y.value_counts(normalize=True)

0    0.54234
1    0.45766
Name: Transmission, dtype: float64

In [None]:
# Relative class frequencies of the target.
y.value_counts(normalize=True)

0    0.54234
1    0.45766
Name: Transmission, dtype: float64

In [None]:
# Re-split with the numerically encoded target; the fixed seed keeps the
# split and all downstream scores reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Ordinal-encoded categorical columns plus normalized numeric columns.
# The first step is named after the encoder it actually holds (it was
# previously labelled 'one_hot_encoder_' while containing an OrdinalEncoder).
p13 = Pipeline([
    ('ordinal_encoder_', OrdinalEncoder(cols=binary_cols+cat_cols)),
    ('scaler_', CustomScaler(list(num_cols))),
    ('model_', LogisticRegression())
    ])

In [None]:
# Fit the ordinal-encoder pipeline on the training split.
p13.fit(X_train, y_train)

Pipeline(steps=[('one_hot_encoder_',
                 OrdinalEncoder(cols=['Make', 'Model', 'Style', 'Fuel_type'],
                                mapping=[{'col': 'Make',
                                          'data_type': dtype('O'),
                                          'mapping': Mercedes           1
Vaz                2
Hyundai            3
Toyota             4
Renault            5
Audi               6
Honda              7
Skoda              8
Dacia              9
Volkswagen        10
Nissan            11
Chrysler          12
BMW               13
Infiniti          14
Opel              15
Land Rover        16
Jaguar            17
Mini              18
Fiat              19
Mazda             20
Lexus             21
Porsche           22
Rover             23
Ford              24
Volvo             25
Seat              26
Chevrolet         27...
                                          'data_type': dtype('O'),
                                          'mapping': Sedan         1
Un

In [None]:
# Train / hold-out accuracy of p13.
accuracies4 = {
    f'{split} accuracy': p13.score(features, target)
    for split, features, target in [
        ('train', X_train, y_train),
        ('test', X_test, y_test),
    ]
}

print(*accuracies4.items())

('train accuracy', 0.8192059862814384) ('test accuracy', 0.8259134555430852)


In [None]:
# Coefficient vector of the fitted logistic regression inside p13.
p13.named_steps['model_'].coef_[0]

array([-2.87673992e-03,  2.86362553e-04, -3.70242586e-02,  2.04490062e-01,
        1.09571363e+00,  5.09054709e-01,  1.29768490e+00, -7.98824444e-01,
       -2.91570926e-01, -4.36316652e-01])

In [None]:
# Pair each input column with its model coefficient and sort ascending.
# NOTE(review): the coefficients are multiplied by 10 before display; the
# reason for this factor is not shown here - confirm it is intended.
df_coefficients = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': p13['model_'].coef_[0] * 10,
})
df_coefficients = df_coefficients.sort_values('coefficient')
df_coefficients

Unnamed: 0,feature,coefficient
7,Age,-7.988244
9,prestige,-4.363167
8,km_year,-2.915709
2,Style,-0.370243
0,Make,-0.028767
1,Model,0.002864
3,Distance,2.044901
5,Fuel_type,5.090547
4,Engine_capacity(cm3),10.957136
6,Price(euro),12.976849


Теперь подбираем оптимальный вариант (гиперпараметры модели) поиском по сетке.

---



In [None]:
# Base pipeline for hyper-parameter search: one-hot encoding, normalized
# numeric columns, and a saga-solved logistic regression.
p14 = Pipeline(steps=[
    ('one_hot_encoder_', OneHotEncoder(cols=binary_cols + cat_cols)),
    ('scaler_', CustomScaler(cols=list(num_cols))),
    ('mult_lr', LogisticRegression(solver='saga')),
])

In [None]:
# Fit the untuned model and record its train / hold-out accuracy.
p14.fit(X_train, y_train)
accuracies5 = dict([
    ('train accuracy', p14.score(X_train, y_train)),
    ('test accuracy', p14.score(X_test, y_test)),
])

print(*accuracies5.items())

('train accuracy', 0.8552899605071711) ('test accuracy', 0.8590846738994887)


In [None]:
# Hyper-parameter grid for the 'mult_lr' step. l1_ratio only has an effect
# with the elastic-net penalty, so it is searched in its own sub-grid instead
# of being crossed with 'l1' / 'l2' (which only repeated identical fits).
params = [
    {
        'mult_lr__C': [1.0, 1e-1, 1e-2, 1e-3],
        'mult_lr__penalty': ['l1', 'l2'],
    },
    {
        'mult_lr__C': [1.0, 1e-1, 1e-2, 1e-3],
        'mult_lr__penalty': ['elasticnet'],
        'mult_lr__l1_ratio': [0.10, 0.25, 0.5, 0.75],
    },
]

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# 3-fold grid search over the regularization settings of p14.
grid_mult_pipe0 = GridSearchCV(estimator=p14, param_grid=params, cv=3)
grid_mult_pipe0.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('one_hot_encoder_',
                                        OneHotEncoder(cols=['Make', 'Model',
                                                            'Style',
                                                            'Fuel_type'])),
                                       ('scaler_',
                                        <__main__.CustomScaler object at 0x7f2cd17f4ca0>),
                                       ('mult_lr',
                                        LogisticRegression(solver='saga'))]),
             param_grid={'mult_lr__C': [1.0, 0.1, 0.01, 0.001],
                         'mult_lr__l1_ratio': [0.1, 0.25, 0.5, 0.75],
                         'mult_lr__penalty': ['l1', 'l2']})

Оптимальные параметры, найденные поиском по сетке

In [None]:
# Parameter combination selected by the grid search.
print(grid_mult_pipe0.best_params_)

{'mult_lr__C': 1.0, 'mult_lr__l1_ratio': 0.1, 'mult_lr__penalty': 'l1'}


In [None]:
# Pipeline refitted with the selected parameters.
print(grid_mult_pipe0.best_estimator_)

Pipeline(steps=[('one_hot_encoder_',
                 OneHotEncoder(cols=['Make', 'Model', 'Style', 'Fuel_type'])),
                ('scaler_', <__main__.CustomScaler object at 0x7f2cd19156a0>),
                ('mult_lr',
                 LogisticRegression(l1_ratio=0.1, penalty='l1',
                                    solver='saga'))])


In [None]:
# Full parameter dictionary of the selected pipeline, including nested step parameters.
grid_mult_pipe0.best_estimator_.get_params()

{'memory': None,
 'steps': [('one_hot_encoder_',
   OneHotEncoder(cols=['Make', 'Model', 'Style', 'Fuel_type'])),
  ('scaler_', <__main__.CustomScaler at 0x7f2cd19156a0>),
  ('mult_lr', LogisticRegression(l1_ratio=0.1, penalty='l1', solver='saga'))],
 'verbose': False,
 'one_hot_encoder_': OneHotEncoder(cols=['Make', 'Model', 'Style', 'Fuel_type']),
 'scaler_': <__main__.CustomScaler at 0x7f2cd19156a0>,
 'mult_lr': LogisticRegression(l1_ratio=0.1, penalty='l1', solver='saga'),
 'one_hot_encoder___cols': ['Make', 'Model', 'Style', 'Fuel_type'],
 'one_hot_encoder___drop_invariant': False,
 'one_hot_encoder___handle_missing': 'value',
 'one_hot_encoder___handle_unknown': 'value',
 'one_hot_encoder___return_df': True,
 'one_hot_encoder___use_cat_names': False,
 'one_hot_encoder___verbose': 0,
 'mult_lr__C': 1.0,
 'mult_lr__class_weight': None,
 'mult_lr__dual': False,
 'mult_lr__fit_intercept': True,
 'mult_lr__intercept_scaling': 1,
 'mult_lr__l1_ratio': 0.1,
 'mult_lr__max_iter': 100,
 '

In [None]:
# Train / hold-out accuracy of the grid-searched model.
accuracies6 = dict(zip(
    ('train accuracy', 'test accuracy'),
    (grid_mult_pipe0.score(X_train, y_train), grid_mult_pipe0.score(X_test, y_test)),
))

print(*accuracies6.items())

('train accuracy', 0.8552483891082935) ('test accuracy', 0.8584611547574511)


In [None]:
# Boolean one-vs-rest targets: "is class 0" and "is class 1".
y0_train = y_train.eq(0)
y1_train = y_train.eq(1)

In [None]:
def onevsall3(X_train, y_train, X_test, y_test):
    """One-vs-all prediction with a separately tuned logistic regression per class.

    For every distinct label in ``y_train`` a pipeline (one-hot encoder ->
    CustomScaler -> saga LogisticRegression) is grid-searched with 3-fold CV on
    the boolean target "row belongs to this label". Each row of ``X_test`` is
    then assigned the label whose model gives the highest probability for its
    positive target; ties go to the smaller label.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames containing the columns listed in the notebook-level
        ``num_cols``, ``binary_cols`` and ``cat_cols``.
    y_train : pandas.Series or array-like
        Training labels.
    y_test : ignored
        Not used; kept in the signature so existing calls keep working.

    Returns
    -------
    pandas.Series
        Predicted label per row of ``X_test``, indexed like ``X_test``.
    """
    c_values = [1.0, 1e-1, 1e-2, 1e-3]
    # l1_ratio only matters for the elastic-net penalty, so it gets its own
    # sub-grid instead of being crossed with 'l1' / 'l2'.
    params = [
        {
            'mult_lr__C': c_values,
            'mult_lr__penalty': ['l1', 'l2'],
        },
        {
            'mult_lr__C': c_values,
            'mult_lr__penalty': ['elasticnet'],
            'mult_lr__l1_ratio': [0.10, 0.25, 0.5, 0.75],
        },
    ]

    class_proba = {}
    # np.unique returns the labels sorted, which fixes the column order below.
    for label in np.unique(y_train):
        mult_pipe = Pipeline([
            ('one_hot_encoder_', OneHotEncoder(cols=binary_cols+cat_cols)),
            ('scaler_', CustomScaler(list(num_cols))),
            ('mult_lr', LogisticRegression(solver='saga'))
        ])
        grid_mult_pipe = GridSearchCV(mult_pipe, params, cv=3)
        grid_mult_pipe.fit(X_train, y_train == label)
        # The target is boolean, so column 1 holds the probability of True,
        # i.e. of the row belonging to `label`.
        class_proba[label] = grid_mult_pipe.best_estimator_.predict_proba(X_test)[:, 1]

    # Columns are the labels themselves, so idxmax yields the predicted label
    # directly; reusing X_test's index keeps predictions aligned with y_test.
    df_result_proba_test = pd.DataFrame(class_proba, index=getattr(X_test, 'index', None))
    return df_result_proba_test.idxmax(axis=1)

In [None]:
# Run the one-vs-all procedure and keep its predictions for the test split.
test_predict_onevsall = onevsall3(X_train, y_train, X_test, y_test)

In [None]:
# Per-class precision / recall / F1 of the one-vs-all predictions on the test split.
print(classification_report(y_test, test_predict_onevsall))

              precision    recall  f1-score   support

           0       0.85      0.90      0.87      4374
           1       0.87      0.81      0.84      3645

    accuracy                           0.86      8019
   macro avg       0.86      0.85      0.86      8019
weighted avg       0.86      0.86      0.86      8019

