In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#from sklearn.model_selection import GridSearchCV

In [20]:
# Load the data
df = pd.read_csv('/datasets/users_behavior.csv')

#data_path = '/datasets/users_behavior.csv'
#users_behavior = pd.read_csv(data_path)

# Display basic information about the dataset
print(df.head())
print(df.info())
print(df.describe())

   calls  minutes  messages   mb_used  is_ultra
0   40.0   311.90      83.0  19915.42         0
1   85.0   516.75      56.0  22696.96         0
2   77.0   467.66      86.0  21060.45         0
3  106.0   745.53      81.0   8437.39         1
4   66.0   418.74       1.0  14502.75         0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calls     3214 non-null   float64
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   float64
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 125.7 KB
None
             calls      minutes     messages       mb_used     is_ultra
count  3214.000000  3214.000000  3214.000000   3214.000000  3214.000000
mean     63.038892   438.208787    38.281269  17207.673836     0.306472
std      33.236368   234.569872    36.148326   7570.968246  

In [21]:
# Define features and target
X = df.drop('is_ultra', axis=1)
y = df['is_ultra']

# Split into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=12345)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=12345)


In [22]:
models = {
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression()
}

best_model = None
best_accuracy = 0

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    print(f'{name} accuracy: {accuracy}')

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

print(f'\nBest model: {best_model}')
print(f'Best accuracy of validation set: {best_accuracy}')


Decision Tree accuracy: 0.7200622083981337
Random Forest accuracy: 0.7884914463452566
Logistic Regression accuracy: 0.7107309486780715

Best model: RandomForestClassifier()
Best accuracy of validation set: 0.7884914463452566


In [30]:
# Hyperparameter Tuning for Decision Tree (using max_depth)
best_model_dt = None
best_accuracy_dt = 0

for max_depth in [None, 1, 5, 10]:  # Try different depths
    model_dt = DecisionTreeClassifier(max_depth=max_depth, random_state=54321)
    model_dt.fit(X_train, y_train)
    y_pred_dt = model_dt.predict(X_valid)
    accuracy_dt = accuracy_score(y_valid, y_pred_dt)
    print(f'Decision Tree accuracy (max_depth={max_depth}): {accuracy_dt}')

    if accuracy_dt > best_accuracy_dt:
        best_accuracy_dt = accuracy_dt
        best_model_dt = model_dt

print(f'\nBest Decision Tree model: {best_model_dt}')
print(f'Best Decision Tree accuracy: {best_accuracy_dt}')

Decision Tree accuracy (max_depth=None): 0.71850699844479
Decision Tree accuracy (max_depth=1): 0.7542768273716952
Decision Tree accuracy (max_depth=5): 0.7791601866251944
Decision Tree accuracy (max_depth=10): 0.776049766718507

Best Decision Tree model: DecisionTreeClassifier(max_depth=5, random_state=54321)
Best Decision Tree accuracy: 0.7791601866251944


Criterion:

- Gini Impurity measures how often a randomly chosen element from the set would be incorrectly labeled if it were randomly labeled according to the distribution of labels in the subset. Lower Gini impurity indicates a better split.

- Entropy measures the reduction in entropy (uncertainty) achieved by splitting the dataset on a particular feature. Higher information gain indicates a better split.

In [31]:
best_model_dt = None
best_accuracy_dt = 0

for max_depth in [None, 5, 10, 15]:
    for criterion in ['gini', 'entropy']:
        for min_samples_split in [2, 5, 10]:
            for min_samples_leaf in [1, 2, 4]:
                model_dt = DecisionTreeClassifier(
                    max_depth=max_depth, 
                    criterion=criterion, 
                    min_samples_split=min_samples_split, 
                    min_samples_leaf=min_samples_leaf,
                    random_state=12345)  
                model_dt.fit(X_train, y_train)
                y_pred_dt = model_dt.predict(X_valid)
                accuracy_dt = accuracy_score(y_valid, y_pred_dt)
                print(f'Decision Tree accuracy (max_depth={max_depth}, criterion={criterion}, min_samples_split={min_samples_split}, min_samples_leaf={min_samples_leaf}): {accuracy_dt}')

                if accuracy_dt > best_accuracy_dt:
                    best_accuracy_dt = accuracy_dt
                    best_model_dt = model_dt

print(f'\nBest Decision Tree model is: {best_model_dt}')
print(f'Best Decision Tree accuracy: {best_accuracy_dt}')

Decision Tree accuracy (max_depth=None, criterion=gini, min_samples_split=2, min_samples_leaf=1): 0.713841368584759
Decision Tree accuracy (max_depth=None, criterion=gini, min_samples_split=2, min_samples_leaf=2): 0.7371695178849145
Decision Tree accuracy (max_depth=None, criterion=gini, min_samples_split=2, min_samples_leaf=4): 0.7247278382581649
Decision Tree accuracy (max_depth=None, criterion=gini, min_samples_split=5, min_samples_leaf=1): 0.7402799377916018
Decision Tree accuracy (max_depth=None, criterion=gini, min_samples_split=5, min_samples_leaf=2): 0.7371695178849145
Decision Tree accuracy (max_depth=None, criterion=gini, min_samples_split=5, min_samples_leaf=4): 0.7247278382581649
Decision Tree accuracy (max_depth=None, criterion=gini, min_samples_split=10, min_samples_leaf=1): 0.7278382581648523
Decision Tree accuracy (max_depth=None, criterion=gini, min_samples_split=10, min_samples_leaf=2): 0.7371695178849145
Decision Tree accuracy (max_depth=None, criterion=gini, min_sam

In [32]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    print(f'{name} accuracy: {accuracy}')

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

print(f'\nBest model: {best_model}')
print(f'Best accuracy of validation set: {best_accuracy}')

Decision Tree accuracy: 0.7200622083981337
Random Forest accuracy: 0.7884914463452566
Logistic Regression accuracy: 0.7107309486780715

Best model: RandomForestClassifier()
Best accuracy of validation set: 0.7884914463452566


- the Random Forest model seems promising cause it has the highest accuracy on the validation set
- it suggests that the Random Forest algorithm is better at capturing the complex relationships between user behavior and plan preference compared to the other two models.

In [33]:
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Accuracy of test set: {test_accuracy}')

Accuracy of test set: 0.7900466562986003


In [34]:
# Sanity checks
# Calculate majority class (0 or 1) in training set
majority_class = y_train.mode()[0]

# Predict majority class for all test examples
y_pred_baseline = [majority_class] * len(y_test)

# Calculate baseline accuracy
baseline_accuracy = accuracy_score(y_test, y_pred_baseline)
print(f'Baseline accuracy: {baseline_accuracy}')


Baseline accuracy: 0.6842923794712286


In [35]:
importances = best_model.feature_importances_
feature_names = X.columns
for feature, importance in zip(feature_names, importances):
    print(f'{feature}: {importance}')

calls: 0.20046339993643733
minutes: 0.2416439836455758
messages: 0.20927234403125186
mb_used: 0.348620272386735


These results suggest that **call duration**(minutes) and **data usage**(mb_used) are the strongest predictors of which plan a user might prefer.  

In [36]:
# Create a DataFrame with predictions and actual values
results = pd.DataFrame({'actual': y_test, 'predicted': y_test_pred})

# Filter misclassified examples
misclassified = results[results['actual'] != results['predicted']]

# Examine misclassified examples
print(misclassified)

      actual  predicted
2498       0          1
1748       0          1
1077       1          0
791        1          0
2557       0          1
...      ...        ...
186        0          1
1763       1          0
2401       0          1
2928       1          0
2313       0          1

[135 rows x 2 columns]


This output shows 138 instances where the model's predictions (the predicted column) did not match the actual user plan choices (the actual column). This means the model misclassified these users.

- False positives (predicted 1 but actual is 0): 
the model predicted that the user would prefer the Ultra plan, but the user actually chose the Smart plan.  This could happen if the user's usage patterns (e.g., call duration, data) were similar to typical Ultra users, but they chose the cheaper Smart plan for some other reason.

- False negatives (predicted 0, but the actual number is 1): The model predicted that the user would prefer the Smart plan, but they actually chose the Ultra plan. This could happen if the user's usage was lower than typical Ultra users, but they still valued the additional features or peace of mind that the Ultra plan offered.


