In [35]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import BaggingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('users_behavior.csv')
df.head(5)

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40.0,311.9,83.0,19915.42,0
1,85.0,516.75,56.0,22696.96,0
2,77.0,467.66,86.0,21060.45,0
3,106.0,745.53,81.0,8437.39,1
4,66.0,418.74,1.0,14502.75,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calls     3214 non-null   float64
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   float64
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 125.6 KB


In [4]:
target = df['is_ultra']
df = df.drop(['is_ultra'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.3, random_state=12)

In [5]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
print(lr.score(X_test, y_test))

0.7067357512953368


**Stacking (стекинг)**

In [41]:
estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),('svr', make_pipeline(StandardScaler(), LinearSVC(dual="auto", random_state=42)))]
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

In [42]:
clf.fit(X_train, y_train).score(X_test, y_test)

0.7823834196891192

**Bagging (бэггинг)**

In [10]:
modelClf = BaggingClassifier(base_estimator=LogisticRegression(), n_estimators=20, random_state=12)

In [11]:
modelClf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
print(modelClf.score(X_train, y_train))

0.7523343708314807


**Random Forest (случайный лес)**

In [13]:
clf = RandomForestClassifier(n_estimators=20, max_depth=6,
                             min_samples_split=2, random_state=0)

In [14]:
scores = cross_val_score(clf, X_train, y_train, cv=3)

In [15]:
scores.mean()

0.8096899569796766

In [16]:
clf.fit(X_train, y_train)

In [17]:
print(clf.score(X_train, y_train))

0.840373499333037


Подбор гиперпараметров для случайного леса (через цикл и кросс-валидацию)

In [18]:
best_score = 0
best_params = None

for max_depth in range(2, 10):
    for n_estimators in range(10, 31, 10):
        model_rfr = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators, random_state=12345)
        score = cross_val_score(model_rfr, X_train, y_train, cv=3, n_jobs=-1).mean()
        if score > best_score:
            best_score = score
            best_params = {'max_depth': max_depth, 'n_estimators': n_estimators}

print('Лучшее значение accuracy для случайного леса: {} при значениях гиперпараметров: {}'.format(best_score, best_params))

Лучшее значение accuracy для случайного леса: 0.8172490728378579 при значениях гиперпараметров: {'max_depth': 9, 'n_estimators': 20}


**Boosting (бустинг)**

**Adaboost (адаптивный бустинг)**

In [19]:
modelClf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2), n_estimators=100, random_state=12)

In [20]:
modelClf.fit(X_train, y_train)



In [21]:
print(modelClf.score(X_test, y_test))

0.7637305699481866


**Gradient Boosting (Градиентный бустинг)**

In [22]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0)

In [23]:
clf.fit(X_train, y_train)

In [24]:
print(clf.score(X_test, y_test))

0.7792746113989637


**Voting (простое усреднение)**

In [25]:
decisiontree = DecisionTreeClassifier(max_depth=2)

In [26]:
forest = RandomForestClassifier(n_estimators=20, max_depth=6,
                             min_samples_split=2, random_state=0)

In [27]:
ensemble=VotingClassifier(estimators=[('Decision Tree', decisiontree), ('Random Forest', forest)], 
                       voting='soft', weights=[1,1]).fit(X_train, y_train)

In [28]:
print(ensemble.score(X_test, y_test))

0.7803108808290156
