# Рекомендация тарифов

В нашем распоряжении данные о поведении клиентов, которые уже перешли на тарифы из проекта курса «Статистический анализ данных». Нужно построить модель для задачи классификации, которая выберет подходящий тариф. Предобработка данных не понадобится — она уже сделана.

Построим модель с максимально большим значением *accuracy*. Проект будет считаться успешным, если доля правильных ответов составит не менее 0.75. Проверим *accuracy* на тестовой выборке самостоятельно.

### Описание данных

    calls — количество звонков,
    minutes — суммарная длительность звонков в минутах,
    messages — количество sms-сообщений,
    mb_used — израсходованный интернет-трафик в Мб,
    is_ultra — каким тарифом пользовался в течение месяца («Ультра» — 1, «Смарт» — 0).

## 1. Откроем и изучим файл

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the pre-processed user-behaviour table and print its column/dtype summary.
DATASET_PATH = '/datasets/users_behavior.csv'
df = pd.read_csv(DATASET_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
calls       3214 non-null float64
minutes     3214 non-null float64
messages    3214 non-null float64
mb_used     3214 non-null float64
is_ultra    3214 non-null int64
dtypes: float64(4), int64(1)
memory usage: 125.7 KB


In [2]:
# Preview the first five rows to eyeball the column contents.
df.head(n=5)

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40.0,311.9,83.0,19915.42,0
1,85.0,516.75,56.0,22696.96,0
2,77.0,467.66,86.0,21060.45,0
3,106.0,745.53,81.0,8437.39,1
4,66.0,418.74,1.0,14502.75,0


### Вывод

Предобработка выполнена. Логика данных понятна.

## 2. Разобьем данные на выборки

In [3]:
# No separate test set is provided, so hold out 20% of the rows as the test sample.
df_train, df_test = train_test_split(
    df,
    random_state=12,
    test_size=0.20,
)

In [4]:
# Carve the validation sample out of the rows left after the test split:
# 25% of the remaining 80%, i.e. roughly 20% of the full dataset.
df_train, df_valid = train_test_split(
    df_train,
    random_state=12,
    test_size=0.25,
)

In [5]:
# Separate every sample into its feature matrix (all columns except the
# tariff flag) and its target vector (the 'is_ultra' tariff flag).
features_train, target_train = df_train.drop(columns='is_ultra'), df_train['is_ultra']
features_valid, target_valid = df_valid.drop(columns='is_ultra'), df_valid['is_ultra']
features_test, target_test = df_test.drop(columns='is_ultra'), df_test['is_ultra']

### Вывод

Разбили все данные в примерном соотношении 60-20-20 на обучающую, валидационную и тестовую выборки.

## 3. Исследуем модели

In [6]:
from sklearn.tree import DecisionTreeClassifier

# Accumulator for grid-search records and the split strategies to compare.
scores, splitter = [], ['best', 'random']

def dtc(max_depth, splitter):
    """Grid-search a decision tree for one split strategy.

    Fits a ``DecisionTreeClassifier`` on the training sample for every
    combination of tree depth and ``min_samples_split`` value, scores it on
    the validation sample and appends one record per combination to the
    module-level ``scores`` list.

    Args:
        max_depth: Exclusive upper bound for the tree depth; depths
            ``1 .. max_depth - 1`` are tried.
        splitter: Value forwarded to the classifier's ``splitter`` argument.

    Returns:
        None. Results are collected in the global ``scores`` list.
    """
    # Flatten the two nested parameter ranges into one ordered grid
    # (depth varies slowest, min_samples_split fastest).
    grid = [
        (depth, min_split)
        for depth in range(1, max_depth)
        for min_split in np.arange(0.1, 1, 0.1)
    ]
    for depth, min_split in grid:
        tree = DecisionTreeClassifier(
            splitter=splitter,
            max_depth=depth,
            min_samples_split=min_split,
            random_state=12,
        )
        tree.fit(features_train, target_train)
        validation_accuracy = accuracy_score(target_valid, tree.predict(features_valid))
        scores.append({
            'splitter': splitter,
            'depth': depth,
            'mss': min_split,
            'accuracy_score': validation_accuracy,
        })

# Run the tree grid search once per split strategy; dtc() appends its
# records to `scores` in place.
for strategy in splitter:
    dtc(max_depth=15, splitter=strategy)

# Keep every parameter combination that reaches the best validation accuracy.
scores = pd.DataFrame(scores)
best_accuracy = scores['accuracy_score'].max()
scores_dtc = scores[scores['accuracy_score'] == best_accuracy]
scores_dtc

Unnamed: 0,splitter,depth,mss,accuracy_score
27,best,4,0.1,0.788491
28,best,4,0.2,0.788491
29,best,4,0.3,0.788491
30,best,4,0.4,0.788491
31,best,4,0.5,0.788491
...,...,...,...,...
120,best,14,0.4,0.788491
121,best,14,0.5,0.788491
122,best,14,0.6,0.788491
123,best,14,0.7,0.788491


In [7]:
from sklearn.ensemble import RandomForestClassifier

# Grid-search a random forest over tree depth, number of trees and
# min_samples_split, scoring each combination on the validation sample.
scores = []

# Same iteration order as three nested loops: depth slowest, mss fastest.
forest_grid = [
    (depth, n_trees, min_split)
    for depth in range(1, 5)
    for n_trees in range(10, 100, 10)
    for min_split in np.arange(0.1, 0.5, 0.1)
]

for depth, n_trees, min_split in forest_grid:
    model = RandomForestClassifier(
        n_estimators=n_trees,
        max_depth=depth,
        min_samples_split=min_split,
        random_state=12,
    )
    model.fit(features_train, target_train)
    predictions_valid = model.predict(features_valid)
    acc = accuracy_score(target_valid, predictions_valid)
    scores.append({
        'depth': depth,
        'mss': min_split,
        'n_estimators': n_trees,
        'accuracy_score': acc,
    })

scores = pd.DataFrame(scores)

# Keep every parameter combination that reaches the best validation accuracy.
best_accuracy = scores['accuracy_score'].max()
scores_rfc = scores[scores['accuracy_score'] == best_accuracy]
scores_rfc

Unnamed: 0,depth,mss,n_estimators,accuracy_score
108,4,0.1,10,0.797823
109,4,0.2,10,0.797823
110,4,0.3,10,0.797823
111,4,0.4,10,0.797823


In [8]:
from sklearn.neighbors import KNeighborsClassifier

# Grid-search k-nearest-neighbours over the weighting scheme and the number
# of neighbours (1..19), scoring each combination on the validation sample.
scores = []

weights = ['uniform', 'distance']

# Same iteration order as two nested loops: weighting scheme outer, k inner.
knn_grid = [(weight, k) for weight in weights for k in range(1, 20)]

for weight, k in knn_grid:
    model = KNeighborsClassifier(n_neighbors=k, weights=weight)
    model.fit(features_train, target_train)
    predictions_valid = model.predict(features_valid)
    acc = accuracy_score(target_valid, predictions_valid)
    scores.append({'n_neighbors': k, 'weight': weight, 'accuracy_score': acc})

scores = pd.DataFrame(scores)

# Keep every parameter combination that reaches the best validation accuracy.
best_accuracy = scores['accuracy_score'].max()
scores_knc = scores[scores['accuracy_score'] == best_accuracy]
scores_knc

Unnamed: 0,n_neighbors,weight,accuracy_score
31,13,distance,0.77605


In [9]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the newton-cg solver: fit on the training sample
# and record accuracy on the validation sample.
model = LogisticRegression(solver='newton-cg', random_state=12).fit(features_train, target_train)
predictions_valid = model.predict(features_valid)
scores_lr = accuracy_score(y_true=target_valid, y_pred=predictions_valid)

scores_lr



0.749611197511664

In [10]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings: fit on the training sample
# and record accuracy on the validation sample.
model = GaussianNB().fit(features_train, target_train)
predictions_valid = model.predict(features_valid)
scores_gnb = accuracy_score(y_true=target_valid, y_pred=predictions_valid)

scores_gnb

0.7744945567651633

In [11]:
from sklearn.ensemble import AdaBoostClassifier

# Grid-search AdaBoost over the number of estimators (10, 20, ..., 140),
# scoring each setting on the validation sample.
scores = []

estimator_counts = range(10, 150, 10)
for n_boosters in estimator_counts:
    model = AdaBoostClassifier(n_estimators=n_boosters, random_state=12)
    model.fit(features_train, target_train)
    predictions_valid = model.predict(features_valid)
    acc = accuracy_score(target_valid, predictions_valid)
    scores.append({'n_estimators': n_boosters, 'accuracy_score': acc})

scores = pd.DataFrame(scores)

# Keep every setting that reaches the best validation accuracy.
best_accuracy = scores['accuracy_score'].max()
scores_abc = scores[scores['accuracy_score'] == best_accuracy]
scores_abc

Unnamed: 0,n_estimators,accuracy_score
1,20,0.794712


### Вывод

Было интересно, как отработают разные модели с разными параметрами.<br>
Видим, что модели показывают примерно схожие результаты.<br>
LogisticRegression показывает более низкие результаты.<br>
Нужно уточнить, что некоторые параметры я отбросил, они не влияли на результат, но требовали вычислительных ресурсов.

## 4. Проверим модели на тестовой выборке

In [12]:
# Evaluate the decision tree on the held-out test sample.
# NOTE(review): criterion='entropy' was not part of the validation grid search
# (that search never passes `criterion`), so this is not exactly the model
# that was selected on validation -- confirm this is intended.
model = DecisionTreeClassifier(
    criterion='entropy',
    splitter='best',
    max_depth=14,
    min_samples_split=0.1,
    random_state=12,
).fit(features_train, target_train)
predictions_test = model.predict(features_test)
accuracy_score(y_true=target_test, y_pred=predictions_test)

0.7511664074650077

In [13]:
# Evaluate the random forest on the held-out test sample, using one of the
# parameter combinations tied for best validation accuracy.
model = RandomForestClassifier(
    n_estimators=10,
    max_depth=4,
    min_samples_split=0.3,
    random_state=12,
).fit(features_train, target_train)
predictions_test = model.predict(features_test)
accuracy_score(y_true=target_test, y_pred=predictions_test)

0.7480559875583204

In [14]:
# Evaluate k-nearest-neighbours on the held-out test sample with the setting
# that scored best on validation (13 neighbours, distance weighting).
model = KNeighborsClassifier(weights='distance', n_neighbors=13).fit(features_train, target_train)
predictions_test = model.predict(features_test)
accuracy_score(y_true=target_test, y_pred=predictions_test)

0.7247278382581649

In [15]:
# Evaluate Gaussian naive Bayes (default settings) on the held-out test sample.
model = GaussianNB().fit(features_train, target_train)
predictions_test = model.predict(features_test)
accuracy_score(y_true=target_test, y_pred=predictions_test)

0.7433903576982893

In [16]:
# Evaluate AdaBoost on the held-out test sample with the estimator count
# that scored best on validation (20).
model = AdaBoostClassifier(random_state=12, n_estimators=20).fit(features_train, target_train)
predictions_test = model.predict(features_test)
accuracy_score(y_true=target_test, y_pred=predictions_test)

0.7465007776049767

### Вывод

На тестовой выборке лучший результат показала модель DecisionTreeClassifier.<br>
Точность > 0,75 выполняется у DecisionTreeClassifier. 

## 5. Проверим модели на адекватность

In [17]:
# Re-split the data with a different seed so the rows are shuffled into new
# train/validation/test samples.
# NOTE(review): the hyperparameters used below were tuned on the
# random_state=12 split, so this new test sample can contain rows that took
# part in tuning -- treat the resulting scores as a stability check rather
# than an unbiased estimate.
df_train, df_test = train_test_split(df, random_state=42, test_size=0.20)
df_train, df_valid = train_test_split(df_train, random_state=42, test_size=0.25)

# Separate every sample into its feature matrix and its target vector.
features_train, target_train = df_train.drop(columns='is_ultra'), df_train['is_ultra']
features_valid, target_valid = df_valid.drop(columns='is_ultra'), df_valid['is_ultra']
features_test, target_test = df_test.drop(columns='is_ultra'), df_test['is_ultra']

In [18]:
# Refit the decision tree on the re-shuffled training sample and score it on
# the new test sample.
# NOTE(review): criterion='entropy' was not part of the validation grid search
# (that search never passes `criterion`) -- confirm this is intended.
model = DecisionTreeClassifier(
    criterion='entropy',
    splitter='best',
    max_depth=14,
    min_samples_split=0.1,
    random_state=12,
).fit(features_train, target_train)
predictions_test = model.predict(features_test)
accuracy_score(y_true=target_test, y_pred=predictions_test)

0.7931570762052877

In [19]:
# Refit the random forest on the re-shuffled training sample and score it on
# the new test sample.
model = RandomForestClassifier(
    n_estimators=10,
    max_depth=4,
    min_samples_split=0.3,
    random_state=12,
).fit(features_train, target_train)
predictions_test = model.predict(features_test)
accuracy_score(y_true=target_test, y_pred=predictions_test)

0.80248833592535

In [20]:
# Refit Gaussian naive Bayes on the re-shuffled training sample and score it
# on the new test sample.
model = GaussianNB().fit(features_train, target_train)
predictions_test = model.predict(features_test)
accuracy_score(y_true=target_test, y_pred=predictions_test)

0.7807153965785381

In [21]:
# Refit AdaBoost (20 estimators) on the re-shuffled training sample and score
# it on the new test sample.
model = AdaBoostClassifier(random_state=12, n_estimators=20).fit(features_train, target_train)
predictions_test = model.predict(features_test)
accuracy_score(y_true=target_test, y_pred=predictions_test)

0.7807153965785381

In [22]:
# Re-split the data once more with another seed (12345) to repeat the check
# on a third shuffle of the rows.
# NOTE(review): the hyperparameters used below were tuned on the
# random_state=12 split, so this test sample can contain rows that took part
# in tuning -- treat the resulting scores as a stability check rather than an
# unbiased estimate.
df_train, df_test = train_test_split(df, random_state=12345, test_size=0.20)
df_train, df_valid = train_test_split(df_train, random_state=12345, test_size=0.25)

# Separate every sample into its feature matrix and its target vector.
features_train, target_train = df_train.drop(columns='is_ultra'), df_train['is_ultra']
features_valid, target_valid = df_valid.drop(columns='is_ultra'), df_valid['is_ultra']
features_test, target_test = df_test.drop(columns='is_ultra'), df_test['is_ultra']

In [23]:
# Refit the decision tree on the current training sample and score it on the
# current test sample.
# NOTE(review): criterion='entropy' was not part of the validation grid search
# (that search never passes `criterion`) -- confirm this is intended.
model = DecisionTreeClassifier(
    criterion='entropy',
    splitter='best',
    max_depth=14,
    min_samples_split=0.1,
    random_state=12,
).fit(features_train, target_train)
predictions_test = model.predict(features_test)
accuracy_score(y_true=target_test, y_pred=predictions_test)

0.7962674961119751

In [24]:
# Refit the random forest on the current training sample and score it on the
# current test sample.
model = RandomForestClassifier(
    n_estimators=10,
    max_depth=4,
    min_samples_split=0.3,
    random_state=12,
).fit(features_train, target_train)
predictions_test = model.predict(features_test)
accuracy_score(y_true=target_test, y_pred=predictions_test)

0.7900466562986003

In [25]:
# Refit Gaussian naive Bayes on the current training sample and score it on
# the current test sample.
model = GaussianNB().fit(features_train, target_train)
predictions_test = model.predict(features_test)
accuracy_score(y_true=target_test, y_pred=predictions_test)

0.7993779160186625

In [26]:
# Refit AdaBoost (20 estimators) on the current training sample and score it
# on the current test sample.
model = AdaBoostClassifier(random_state=12, n_estimators=20).fit(features_train, target_train)
predictions_test = model.predict(features_test)
accuracy_score(y_true=target_test, y_pred=predictions_test)

0.7884914463452566

### Вывод

Можно увидеть, что все модели показывают приемлемую точность для задания.<br>
DecisionTreeClassifier показывает более стабильные результаты. Применительно к этой задаче я бы выбрал именно эту модель.