In [1]:
#imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error

# Data Preparation

In [2]:
#read file
df = pd.read_csv('/datasets/users_behavior.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
calls       3214 non-null float64
minutes     3214 non-null float64
messages    3214 non-null float64
mb_used     3214 non-null float64
is_ultra    3214 non-null int64
dtypes: float64(4), int64(1)
memory usage: 125.7 KB


In [4]:
df.head(10)

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40.0,311.9,83.0,19915.42,0
1,85.0,516.75,56.0,22696.96,0
2,77.0,467.66,86.0,21060.45,0
3,106.0,745.53,81.0,8437.39,1
4,66.0,418.74,1.0,14502.75,0
5,58.0,344.56,21.0,15823.37,0
6,57.0,431.64,20.0,3738.9,1
7,15.0,132.4,6.0,21911.6,0
8,7.0,43.39,3.0,2538.67,1
9,90.0,665.41,38.0,17358.61,0


In [5]:
#viewing some statistics
print('smart:',round(df[df['is_ultra']==0].shape[0]/df.shape[0],2), \
      '\nultra:', round(df[df['is_ultra']==1].shape[0]/df.shape[0],2),'\n')
stat={}
for plan,mask in zip(['smart','ultra'],[df.eval('is_ultra == 0'),df.eval('is_ultra == 1')]):
    stat[plan]=pd.concat([df.loc[mask,:].median(), \
                        df.loc[mask,:].mean(), \
                        df.loc[mask,:].std(), \
                        df.loc[mask,:].min(), \
                        df.loc[mask,:].max()], axis=1)
    stat[plan].columns=['median','mean','std','min','max']
    stat[plan]=stat[plan].round(1)
    print(plan , 'statistics\n',stat[plan],'\n\n') 

smart: 0.69 
ultra: 0.31 

smart statistics
            median     mean     std  min      max
calls        60.0     58.5    25.9  0.0    198.0
minutes     410.6    405.9   184.5  0.0   1390.2
messages     28.0     33.4    28.2  0.0    143.0
mb_used   16506.9  16208.5  5870.5  0.0  38552.6
is_ultra      0.0      0.0     0.0  0.0      0.0 


ultra statistics
            median     mean      std  min      max
calls        74.0     73.4     43.9  0.0    244.0
minutes     502.6    511.2    308.0  0.0   1632.1
messages     38.0     49.4     47.8  0.0    224.0
mb_used   19308.0  19468.8  10087.2  0.0  49745.7
is_ultra      1.0      1.0      0.0  1.0      1.0 




In [3]:
#splitting data to train set (60%), validation (20%) and test (20%) sets 
sets = {}
sets['train'], df_val_test = train_test_split(df, test_size=0.4, random_state=12345)
sets['valid'], sets['test'] = train_test_split(df_val_test, test_size=0.5, random_state=12345)

In [4]:
#defining features and tagets

sets_parts = {}
for set_name in sets.keys():
    sets_parts[set_name + '_' + 'features'] = sets[set_name].drop(['is_ultra'], axis=1)
    sets_parts[set_name + '_' + 'target'] = sets[set_name]['is_ultra']

# Training models

## Training with Decision Tree

In [6]:
#Decision Tree trainings

acc_DT = {}
mean_DT = {}
for max_depth in range(1,6):
    model = DecisionTreeClassifier(random_state=12340+max_depth, max_depth=max_depth)
    model.fit(sets_parts['train_features'], sets_parts['train_target'])
    #predict on validation set
    predictions = model.predict(sets_parts['valid_features'])
    acc_DT[max_depth]=(accuracy_score(sets_parts['valid_target'], predictions))
    mean_DT[max_depth]=predictions.mean()
#    acc_DT[max_depth]=model.score(sets_parts['valid_features'],sets_parts['valid_target'])

In [7]:
max_depth = max(acc_DT, key=acc_DT.get)
print('max accuracy: ', acc_DT[max_depth],' max_depth: ', max_depth, ' mean predictions: ', mean_DT[max_depth])

max accuracy:  0.7853810264385692  max_depth:  3  mean predictions:  0.17884914463452567


## Training with Random Forest

In [8]:
#Random Forest trainings

acc_RF = {}
mean_RF = {}
for n_estimators in range(4,16):
    model = RandomForestClassifier(random_state=12340+n_estimators, n_estimators=n_estimators)
    model.fit(sets_parts['train_features'], sets_parts['train_target'])
    #predict on validation set
    predictions = model.predict(sets_parts['valid_features'])
    acc_RF[n_estimators]=(accuracy_score(sets_parts['valid_target'], predictions))
    mean_RF[n_estimators]=predictions.mean()
#    acc_RF[n_estimators]=model.score(sets_parts['valid_features'],sets_parts['valid_target'])

In [9]:
n_estimators = max(acc_RF, key=acc_RF.get)
print('max accuracy: ', acc_RF[n_estimators],'n_estimators: ', n_estimators, ' mean predictions: ', mean_RF[n_estimators])

max accuracy:  0.7900466562986003 n_estimators:  10  mean predictions:  0.19906687402799378


## Training with Logistic Regression

In [10]:
#Logistic Regression training

model = LogisticRegression(random_state=12345)
model.fit(sets_parts['train_features'], sets_parts['train_target'])
predictions = model.predict(sets_parts['valid_features'])
acc_LR = (accuracy_score(sets_parts['valid_target'], predictions))
mean_LR = predictions.mean()
#model.score(sets_parts['valid_features'],sets_parts['valid_target'])
print('accuracy: ', acc_LR, ' mean predictions: ', mean_LR)

accuracy:  0.7589424572317263  mean predictions:  0.08087091757387248




The best trained model is Random Forest with  n_estimators = 10, accuracy = 0.79 and predictions mean = 0.2

## Sanity Check

In [11]:
np.mean([sets['train']['is_ultra'].mean(), sets['valid']['is_ultra'].mean()])

0.30075364764492174

30% of the set is classified as '1'. A constant prediction of zeros would result in an accuracy score of 0.7

In [15]:
#Checking accuracy score for a prediction of 0's
random_pred = pd.Series(0,sets_parts['valid_target'].index)
accuracy_score(sets_parts['valid_target'], random_pred)

0.7060653188180405

In [34]:
#training random forest with n=10 multiple times to check avarage score and best random seed
mean_score = 0.79
max_score = 0.79
i_max_score = 10
for i in range(0,50):
    model = RandomForestClassifier(random_state=12340+i, n_estimators=10)
    model.fit(sets_parts['train_features'], sets_parts['train_target'])
    predictions = model.predict(sets_parts['valid_features'])
    mean_predictions = np.mean([mean_predictions, predictions.mean()])
    score = accuracy_score(sets_parts['valid_target'], predictions)
    mean_score = np.mean([mean_score, score])
    if score > max_score:
        max_score = score
        i_max_score = i
print('Mean predictions: ',mean_predictions, 'Mean score: ', mean_score, 'random seed: ',12340+i_max_score )

Mean predictions:  0.21162827617419722 Mean score:  0.7809241263633655 random seed:  12374


# Training final model and testing on test set

In [39]:
#final model
final_model = RandomForestClassifier(random_state=12340+i_max_score, n_estimators=10)
features = pd.concat([sets_parts['train_features'], sets_parts['valid_features']], ignore_index=True)
target = pd.concat([sets_parts['train_target'], sets_parts['valid_target']], ignore_index=True)
final_model.fit(features, target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=12374,
                       verbose=0, warm_start=False)

In [40]:
#Testing the model on test set
predictions = final_model.predict(sets_parts['test_features'])
print('Mean predictions: ',predictions.mean(), 'Score: ', accuracy_score(sets_parts['test_target'], predictions) )

Mean predictions:  0.23483670295489892 Score:  0.7947122861586314


Conclusion:
- Best model is Random Forest with n = 10 
- Achieved a score of 0.794, compared with a dummy score of 0.7
- Achieved predictions ratio of 0.234, compared with 0.3 true ratio in training set 