In [194]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [195]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [196]:
class Information():
    def __init__(self):
        print('----information------')
        
    def info(self, data):
        features = data.columns.values
        dtypes = data.dtypes
        missing_values = data.isna().sum()
        print('{:20}{:20}{:20}{:15}'.format('features'.upper(), 'dtypes'.upper(), 'missing_values'.upper(), 
                              '#samples'.upper() ))
        for feature, dtype,nas in zip(features, dtypes, missing_values):
            print("{:20}{:20}{:20}".format(feature, str(dtype), str(nas)), end='')
            for v in data[feature][:3]:
                print(v, end=',')
            print()
        print()
                
        

In [197]:
class Data_Process_Strategy():
    def __init__(self):
        print('----data process strategy----')
    def fillna(self, data):
        data['Age'] = data['Age'].fillna(data['Age'].median())
        data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
        data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
        return data
    
    def drop_cols(self, data, drop_list):
        return data.drop(drop_list, axis=1)
        
    def feature_engineering(self, data):
        data['family_size'] = data['SibSp'] + data['Parch'] + 1
        data['is_along'] = data['family_size'].apply(lambda x: 1 if x==1 else 0)
        data['title'] = data['Name'].str.split(',', expand=True)[1].str.split('.', expand = True)[0]
        d= data['title'].value_counts()>10
        data['title'].apply(lambda x: 'Misc' if d[x] else x)
        data['Age_grs'] = pd.cut(data['Age'],5)
        data['Fre_grs'] = pd.cut(data['Fare'], 5)
        data = self.drop_cols(data, ['Age', 'Fare', 'Name'])
        return data
    def ohe(self, data, preferred_cols=None):
        ohe = OneHotEncoder(handle_unknown='ignore')
        ohe_cols = list(data.select_dtypes('O').columns.values)
        non_ohe_cols = [i for i in data.columns if i not in ohe_cols]
        if preferred_cols is not None:
            ohe_cols = preferred_cols
            data_ohe = pd.DataFrame(ohe.fit_transform(data[preferred_cols]).toarray(), 
                                    columns=ohe.get_feature_names_out() )
            data_ohe.index = data.index
            data_with_ohe = pd.concat([data[non_ohe_cols], data_ohe], axis=1)
        else:
            ohe_cols = data.columns.values
            data_ohe = pd.DataFrame(ohe.fit_transform(data[ohe_cols]).toarray(),
                                    columns= ohe.get_feature_names_out() )
            data_ohe.index = data.index
            data_with_ohe = data_ohe
        return data_with_ohe
        

In [182]:
class Data_Process_Execution():
    def __init__(self):
        print('----execution of data processing---')
        self._preprocess = Data_Process_Strategy()
        
    def preprocessing(self, data):
        self.data = data
        self.data = self._preprocess.fillna(self.data)
        self.data = self._preprocess.feature_engineering(self.data)
        self.data = self._preprocess.drop_cols(self.data, ['Cabin', 'PassengerId', 'Ticket'])
        self.data = self._preprocess.ohe(self.data)
        return self.data
    

In [183]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [184]:
class Grid_Search():
    def __init__(self):
        print('---grid search----')
        self.models_and_params =[]
        self._models_and_params()
    def _models_and_params(self):
        model = RandomForestClassifier()
        params = {
            'n_estimators':[50, 100,200], 'criterion':['gini', 'entropy','log_loss'], 
            'max_depth':[2,3,4,5], 'min_samples_split':[2,3,4,5,6], 
            'min_samples_leaf':[1,2,3,4] 
        }
        self.models_and_params.append((model, params))
        model = LogisticRegression()
        params = {
            'penalty': ['l1', 'l2', 'elasticnet', None], 
            'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'], 
            'max_iter': [100,200, 50], 'multi_class': ['auto', 'ovr', 'multinomial'], 
       
        }
        self.models_and_params.append((model, params))
    def fit_predict(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        self.results={}
        for model, params in self.models_and_params:
            model_name = model.__class__.__name__
            grid = GridSearchCV(model, params, cv=2)
            grid.fit(self.X_train, self.y_train)
            scores = grid.score(self.X_train, self.y_train)
            self.results[model_name] = scores
            self.results['params'] = grid.best_params_
            
    def show_results(self):
#         self.fit_predict()
        print(self.results)

In [185]:
class OOP():
    def __init__(self, train, test):
        print('---oop----')
        self._information = Information()
        self._preprocessing = Data_Process_Execution()
        self._grid = Grid_Search()
        self.train = train.drop(['Survived'],axis=1)
        self.test = test
        self.y_train = train['Survived']
        self.all_data = pd.concat([self.train, self.test])
        self.number_of_train = self.train.shape[0]
        
    def preprocessing(self):
        self.all_data =  self._preprocessing.preprocessing(self.all_data)
        
    def machine_learning(self):
        self._get_train_test()
        self._grid.fit_predict(self.X_train, self.y_train)
    def information(self):
        return self._information.info(self.all_data)
        
    def _get_train_test(self):
        self.X_train = self.all_data[:self.number_of_train]
        self.X_test = self.all_data[self.number_of_train:]
        
    def results(self):
        self._grid.show_results()

In [186]:
o= OOP(train, test)

---oop----
----information------
----execution of data processing---
----data process strategy----
---grid search----


In [187]:
# o.information()

FEATURES            DTYPES              MISSING_VALUES      #SAMPLES       
PassengerId         int64               0                   1,2,3,
Pclass              int64               0                   3,1,3,
Name                object              0                   Braund, Mr. Owen Harris,Cumings, Mrs. John Bradley (Florence Briggs Thayer),Heikkinen, Miss. Laina,
Sex                 object              0                   male,female,female,
Age                 float64             263                 22.0,38.0,26.0,
SibSp               int64               0                   1,1,0,
Parch               int64               0                   0,0,0,
Ticket              object              0                   A/5 21171,PC 17599,STON/O2. 3101282,
Fare                float64             1                   7.25,71.2833,7.925,
Cabin               object              1014                nan,C85,nan,
Embarked            object              2                   S,C,S,



In [188]:
o.preprocessing()

In [190]:
# o.information()

In [191]:
o.machine_learning()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Fur

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of I

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
222 fits failed out of a total of 432.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the f



In [192]:
o.results()

{'RandomForestClassifier': 0.8327721661054994, 'params': {'max_iter': 200, 'multi_class': 'multinomial', 'penalty': None, 'solver': 'sag'}, 'LogisticRegression': 0.8327721661054994}
