In [1]:
import pandas as  pd

import dill

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score, r2_score, mean_absolute_error

In [2]:
# Prepare the data
df = pd.read_csv("../data/Placement_Data_Full_Class.csv")
# 'sl_no' and 'salary' are not used by anything below, so drop them right away.
df = df.drop(columns=['sl_no', 'salary'])

# Encode the target as 1/0. The result is assigned back to the frame on purpose:
# calling `df['status'].replace(..., inplace=True)` operates on the intermediate
# object returned by `df['status']` (chained assignment). pandas warns about that
# pattern, and with Copy-on-Write enabled it leaves `df` unchanged.
df['status'] = df['status'].map({'Placed': 1, 'Not Placed': 0})
if df['status'].isna().any():
    # Fail here rather than later inside model fitting with a less obvious error.
    raise ValueError("Unexpected value in 'status': expected only 'Placed' or 'Not Placed'")

features = ['ssc_p', 'hsc_p', 'degree_p']
target = ['status']

# The fixed random_state keeps the train/test split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target], test_size=0.33, random_state=10
)

# Save the test split (without the index column)
X_test.to_csv("../data/X_test.csv", index=False)
y_test.to_csv("../data/y_test.csv", index=False)
# Save the train split (without the index column)
X_train.to_csv("../data/X_train.csv", index=False)
y_train.to_csv("../data/y_train.csv", index=False)

In [3]:
# Assemble our simple pipeline
 
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that extracts one column from a data frame so that
    further transformations can be applied to it.
    Intended for numeric columns.
    """
    def __init__(self, key):
        # Label of the column that `transform` will extract.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return `X` restricted to the single column named by `self.key`."""
        # Indexing with a one-element list (not a bare label) is deliberate:
        # it is the same `X[[key]]` selection, just spelled in two steps.
        wanted_columns = [self.key]
        return X[wanted_columns]
    
  
continuous_columns = features

# One (name, pipeline) pair per continuous column: select the column, then
# standardize it. Each pair is named after the column it handles.
final_transformers = [
    (
        column_name,
        Pipeline([
            ('selector', NumberSelector(key=column_name)),
            ('scaler', StandardScaler())
        ]),
    )
    for column_name in continuous_columns
]

# Combine the per-column pipelines into a single feature-building step.
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [4]:
# Full model: the feature union followed by a linear regression fitted on the
# 1/0-encoded 'status' target.
# NOTE(review): the final step is named 'classifier' although the estimator is a
# regressor; the name is kept as-is because code loading the saved pipeline may
# look the step up by this name.
pipeline_steps = [
    ('features', feats),
    ('classifier', LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

# Fit our pipeline on the training split
model.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('ssc_p',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='ssc_p')),
                                                                 ('scaler',
                                                                  StandardScaler())])),
                                                ('hsc_p',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='hsc_p')),
                                                                 ('scaler',
                                                                  StandardScaler())])),
                                                ('degree_p',
                                                 Pipeline(steps=[('selector',
                                        

In [12]:
# Score the held-out split and print ROC AUC, R^2 and MAE, in that order.
predictions = model.predict(X_test)
for metric in (roc_auc_score, r2_score, mean_absolute_error):
    # Every metric is called as metric(true values, predicted values).
    print(metric(y_test, predictions))

0.9008695652173913
0.4273160365841553
0.30305649962682124


In [6]:
# Serialize the fitted pipeline with dill and write it to the app's models directory.
pipeline_path = "../app/models/pipeline.dill"
with open(pipeline_path, "wb") as model_file:
    dill.dump(model, model_file)