In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

In [None]:
# Load the raw dataset; the relative path is resolved against the current
# working directory of the kernel.
df = pd.read_csv('spam7.csv')

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Encode the textual label as a binary target: 'y' -> 1, anything else -> 0.
# The comparison is vectorized, so no per-row Python lambda is needed.
# The guard and errors='ignore' keep this cell safe to re-run after the raw
# columns have already been dropped.
if 'yesno' in df.columns:
    df['spam'] = (df['yesno'] == 'y').astype(int)
# 'Unnamed: 0' is presumably the index column saved with the CSV -- TODO confirm.
df = df.drop(columns=['Unnamed: 0', 'yesno'], errors='ignore')

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns='spam')
# Keep the target 1-D (a Series, not a one-column DataFrame): scikit-learn
# estimators expect a 1-D y, and element-wise comparisons against 1-D
# prediction arrays only line up with a 1-D target.
y = df['spam']

In [None]:
# Snapshot of the original feature names, taken before any derived columns
# are added to X.
cols = list(X.columns)

In [None]:
# Add a pairwise interaction (element-wise product) feature for every
# unordered pair of the original columns; the new column is named
# '<first>_<second>'.
for position, left in enumerate(cols):
    for right in cols[position + 1:]:
        if right != left:
            X[left + '_' + right] = X[left] * X[right]
    
    

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline gradient-boosting model with its hyper-parameters spelled out so
# they are easy to tune. random_state pins the estimator's internal
# randomness, matching the seed used for the train/test split.
boost = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    max_features=None,
    random_state=42,
)

In [None]:
# Fit the baseline model. np.ravel hands scikit-learn a 1-D target even when
# y_train is a one-column DataFrame, which avoids the column-vector warning.
boost.fit(X_train, np.ravel(y_train))


In [None]:
# Accuracy of the baseline model on the held-out rows.
y_test_pred = boost.predict(X_test)
score = accuracy_score(y_test, y_test_pred)

In [None]:
# Show the held-out accuracy rounded to three decimals.
round(score, ndigits=3)

In [None]:
# Feature importances exposed by the fitted booster; plotted against the
# column names in the next cell.
importances = boost.feature_importances_

In [None]:
# Bar chart of the fitted model's feature importances.
# seaborn >= 0.12 accepts x/y only as keyword arguments, so they are named
# explicitly; the explicit Axes keeps labels and ticks tied to this figure.
fig, ax = plt.subplots(figsize=(30, 9))
sns.barplot(x=list(X.columns), y=importances, ax=ax)
ax.set(
    title='Gradient boosting feature importances',
    xlabel='Feature',
    ylabel='Importance',
)
# Column names are long (interaction features), so rotate them to stay legible.
ax.tick_params(axis='x', rotation=90)

In [None]:
# Search space for the first tuning pass: learning rates on a log scale and
# ensemble sizes of 100 plus every multiple of 250 up to 1750.
param_grid = dict(
    learning_rate=[0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    n_estimators=[100, *range(250, 1751, 250)],
)

In [None]:
# Exhaustive 5-fold cross-validated search over param_grid, scored by
# accuracy and run on all available cores.
grid = GridSearchCV(
    estimator=boost,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)

In [None]:
# Run the grid search on the training split. np.ravel hands scikit-learn a
# 1-D target even when y_train is a one-column DataFrame.
# A held-out check of the best model found can be done with:
#     accuracy_score(y_test, grid.predict(X_test))
grid.fit(X_train, np.ravel(y_train))

In [None]:
# Best learning_rate / n_estimators combination found by the search above.
grid.best_params_

In [None]:
# Search space for the second tuning pass: tree depths 5 through 15.
param_grid = {'max_depth': list(range(5, 16))}

In [None]:
# Second 5-fold search, this time over the depth grid. It starts again from
# the base `boost` estimator and rebinds the name `grid`.
grid = GridSearchCV(
    estimator=boost,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)

In [None]:
# Run the depth search on the training split. np.ravel hands scikit-learn a
# 1-D target even when y_train is a one-column DataFrame.
grid.fit(X_train, np.ravel(y_train))

In [None]:
# Best max_depth found by the depth search above.
grid.best_params_

In [None]:
# Preview only the first rows of the feature matrix (including the derived
# interaction columns) instead of rendering the whole frame.
X.head()

In [None]:
def AdaBoost_scratch(X, y, M=10, learning_rate=1):
    """Fit a discrete AdaBoost ensemble of decision stumps and print its training accuracy.

    Each round fits a depth-1 ``DecisionTreeClassifier`` on ``X`` with the
    current sample weights, measures its weighted misclassification rate,
    derives the stump's vote weight from that rate, and up-weights the
    misclassified samples for the next round.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training features, passed unchanged to the stump's ``fit``/``predict``.
    y : array-like of shape (n_samples,) or (n_samples, 1)
        Binary target. Any two label values are accepted (e.g. ``{0, 1}`` or
        ``{-1, 1}``); a one-column DataFrame is flattened to 1-D.
    M : int, default=10
        Number of boosting rounds (stumps).
    learning_rate : float, default=1
        Multiplier applied to every estimator weight.

    Returns
    -------
    estimator_list : numpy.ndarray of length M
        The fitted stumps, in training order.
    estimator_weight_list : numpy.ndarray of shape (M,)
        Vote weight of every stump.
    sample_weight_list : numpy.ndarray of shape (M + 1, n_samples)
        Sample weights before the first round and after each round; every
        row sums to 1.

    Raises
    ------
    ValueError
        If ``y`` is empty or holds more than two distinct labels.

    Notes
    -----
    The final prediction is a weighted vote: labels are mapped to -1/+1
    internally, so the vote is correct for 0/1 targets as well. A tied vote
    resolves to the smaller label.
    """
    # Flatten the target so element-wise comparisons with the 1-D predictions
    # work for Series, lists and one-column DataFrames alike.
    y_true = np.asarray(y).ravel()
    N = len(y_true)
    if N == 0:
        raise ValueError("y must contain at least one sample")
    classes = np.unique(y_true)
    if len(classes) > 2:
        raise ValueError("AdaBoost_scratch supports binary targets only")
    negative_label, positive_label = classes[0], classes[-1]

    # Bounds for the error rate, so that log((1 - e) / e) stays finite when a
    # stump classifies every sample correctly (or every sample incorrectly).
    eps = 1e-10

    estimator_list, y_predict_list = [], []
    estimator_weight_list, sample_weight_list = [], []

    # Start from uniform sample weights.
    sample_weight = np.ones(N) / N
    sample_weight_list.append(sample_weight.copy())

    for m in range(M):
        # Fit the base model on the re-weighted data and predict the training set.
        estimator = DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)
        estimator.fit(X, y_true, sample_weight=sample_weight)
        y_predict = np.asarray(estimator.predict(X)).ravel()

        # Mask of misclassified samples.
        incorrect = (y_predict != y_true)

        # Weighted error rate; valid because sample_weight is kept normalised
        # to sum to 1 at the end of every round.
        estimator_error = np.clip(np.sum(sample_weight * incorrect), eps, 1 - eps)

        # Vote weight of this stump.
        estimator_weight = learning_rate * np.log((1 - estimator_error) / estimator_error)

        # Boost the weights of the misclassified samples, then renormalise so
        # the next round's error is again a proper weighted average.
        sample_weight *= np.exp(estimator_weight * incorrect * ((sample_weight > 0) | (estimator_weight < 0)))
        sample_weight /= sample_weight.sum()

        # Record this round's results.
        estimator_list.append(estimator)
        y_predict_list.append(y_predict.copy())
        estimator_weight_list.append(estimator_weight)
        sample_weight_list.append(sample_weight.copy())

    # Convert the per-round records to arrays for convenience.
    estimator_list = np.asarray(estimator_list)
    y_predict_list = np.asarray(y_predict_list).reshape(-1, N)
    estimator_weight_list = np.asarray(estimator_weight_list, dtype=float)
    sample_weight_list = np.asarray(sample_weight_list)

    # Weighted vote on -1/+1 encoded predictions, mapped back to the original labels.
    signed_votes = np.where(y_predict_list == positive_label, 1.0, -1.0)
    margin = estimator_weight_list @ signed_votes
    preds = np.where(margin > 0, positive_label, negative_label)
    print('Accuracy = ', (preds == y_true).sum() / N)

    return estimator_list, estimator_weight_list, sample_weight_list

In [None]:
# Train the from-scratch AdaBoost on the full feature matrix: 10 stumps with
# a small learning rate. The function prints the training accuracy.
estimator_list, estimator_weight_list, sample_weight_list = AdaBoost_scratch(
    X,
    y,
    M=10,
    learning_rate=0.001,
)

In [None]:
# Show the raw target values as a (transposed) NumPy array.
y.to_numpy().T