In [2]:
import pandas as pd
import numpy as np
import sklearn 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')
np.random.seed(23)
pd.set_option('display.max_columns', None)

In [3]:
mobile_modelling = pd.read_csv('../Data/Data_modelling/mobile_modelling.csv')

X = mobile_modelling.iloc[:, 0:-1]
y = mobile_modelling.iloc[:, -1]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val, y_val, stratify=y_val, test_size=0.3, random_state=42
)

mobile_df = pd.concat([X_train, y_train], axis=1)

print(mobile_df.shape)

(980, 21)


In [4]:
r = 6
c = 4

plt.figure(figsize=(13, 13))

for i, column in enumerate(train_df.columns):
    plt.subplot(r, c, i+1)
    dis_dist = sns.distplot(mobile_df[column], bins=20)
    dis_dist.set_title(f'Distribution of variable {column}')  

plt.tight_layout() 
plt.show()

NameError: name 'train_df' is not defined

<Figure size 1300x1300 with 0 Axes>

# 1. Feature engineering

### Missing values

We don't have a problem with missing values, but we have 2 columns in which we have records with the value of 0, what doesn't make sense:

The first one is **px_height**. There is 1 record with the value of 0. So we decided to remove it from our dataset.

The second one is **sc_w**. In this case we have 124 records with a value of 0. Removing this amount of records could have an influence on our data and predictions. This is why we decided to try to replace missing values with a mean value from this column in dataset.


In [None]:
print("px_height with value of 0:")
print((mobile_df['px_height'] == 0).sum())

print("sc_w with value of 0:")
print((mobile_df['sc_w'] == 0).sum())

px_height with value of 0:
1
sc_w with value of 0:
94


In [None]:
mean_sc_w = mobile_df[mobile_df['sc_w'] != 0]['sc_w'].mean()
mobile_df['sc_w'] = mobile_df['sc_w'].replace(0, mean_sc_w)

- **encoding categorical variables**

    In our dataset we don't have any categorical variables that need encoding.
- **transforming variables**

    In our opinion, there is no need to transform any of the variables.
- **scaling/standarisation variables**

    SPRAWDZIĆ DLA JAKICH MODELI MA SENS A DLA JAKICH NIE
- **outliers**

    After removing missing values, we don't observe any outliers in our dataset.


# 2. Preliminary modelling

- Dummy clasifier
- Decision Tree Classifier
- Logistic Regression
- SVC
- Random Forest Classifier

In [None]:
X_train = mobile_df.iloc[:, 0:-1]
y_train = mobile_df.iloc[:, -1]

### Random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)
importances = model.feature_importances_
features = X.columns
forest_importances = pd.Series(importances, index=features).sort_values(ascending=False)
forest_importances

ram              0.467112
battery_power    0.076026
px_width         0.057708
px_height        0.055458
mobile_wt        0.039427
int_memory       0.037880
talk_time        0.032190
pc               0.030592
sc_w             0.029553
clock_speed      0.028856
sc_h             0.028498
m_dep            0.026579
n_cores          0.024210
fc               0.023903
touch_screen     0.008145
blue             0.007236
dual_sim         0.007207
four_g           0.007192
wifi             0.006383
three_g          0.005847
dtype: float64

After checking the importance of all the features, we will test models without taking some of the features into consideration.

In [12]:
# more than 0.02
columns_to_drop_1 = ['touch_screen', 'blue', 'dual_sim', 'four_g', 'wifi', 'three_g']
X_train_1 = X_train.drop(columns=columns_to_drop_1)
X_val_1 = X_val.drop(columns=columns_to_drop_1)

# more than 0.05
columns_to_drop_2 = ['touch_screen', 'blue', 'dual_sim', 'four_g', 'wifi', 'three_g', 'fc', 'n_cores', 'm_dep', 'sc_h', 'clock_speed', 'sc_w', 'pc', 'talk_time', 'int_memory', 'mobile_wt']
X_train_2 = X_train.drop(columns=columns_to_drop_2)
X_val_2 = X_val.drop(columns=columns_to_drop_2)

# just RAM and battery power
X_train_3 = X_train[['ram', 'battery_power']]
X_val_3 = X_val[['ram', 'battery_power']]

# more than 0.032
columns_to_drop_4 = ['touch_screen', 'blue', 'dual_sim', 'four_g', 'wifi', 'three_g', 'fc', 'n_cores', 'm_dep', 'sc_h', 'clock_speed', 'sc_w', 'pc', 'talk_time']
X_train_4 = X_train.drop(columns=columns_to_drop_4)
X_val_4 = X_val.drop(columns=columns_to_drop_4)

### Standarisation

We'll also analyse some of the models with using 2 types of dataset transformations:
- Standarisation
- Box Cox transformation

In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.fit_transform(X_val)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_val_scaled = pd.DataFrame(X_val_scaled, columns=X_val.columns)

## Dummy Clasifier

In [9]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report

dc = DummyClassifier(strategy='uniform')
dc.fit(X_train,y_train)
y_pred = dc.predict(X_val)
print('most_frequent:')
print(classification_report(y_val, y_pred))

dc = DummyClassifier(strategy='stratified')
dc.fit(X_train,y_train)
y_pred = dc.predict(X_val)
print('stratified:')
print(classification_report(y_val, y_pred))


most_frequent:
              precision    recall  f1-score   support

           0       0.22      0.21      0.21        73
           1       0.22      0.22      0.22        74
           2       0.20      0.22      0.21        73
           3       0.29      0.28      0.29        74

    accuracy                           0.23       294
   macro avg       0.23      0.23      0.23       294
weighted avg       0.23      0.23      0.23       294

stratified:
              precision    recall  f1-score   support

           0       0.31      0.26      0.28        73
           1       0.33      0.28      0.30        74
           2       0.31      0.33      0.32        73
           3       0.24      0.30      0.27        74

    accuracy                           0.29       294
   macro avg       0.30      0.29      0.29       294
weighted avg       0.30      0.29      0.29       294



As we can observe the accuracy of this model is pretty low, so we decided not to check if any further changes will affect it since we believe we will not be able to achieve satisfactory results.

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# all
model = LogisticRegression(multi_class="ovr")
model.fit(X_train, y_train)
y_pred = dc.predict(X_val)

print('Result for all columns:')
print(classification_report(y_val, y_pred))

# all standarasied
model = LogisticRegression(multi_class="ovr")
model.fit(X_train_scaled, y_train)
y_pred = dc.predict(X_val_scaled)

print('Result for all columns, standaraised:')
print(classification_report(y_val, y_pred))

# over 0.02
model.fit(X_train_1, y_train)
y_pred_1 = dc.predict(X_val_1)

print('Result for features with importance greater than 0.02:')
print(classification_report(y_val, y_pred_1))

# over 0.032
model.fit(X_train_4, y_train)
y_pred_4 = dc.predict(X_val_2)

print('Result for features with importance greater than 0.032:')
print(classification_report(y_val, y_pred_4))

# over 0.05
model.fit(X_train_2, y_train)
y_pred_2 = dc.predict(X_val_2)

print('Result for features with importance greater than 0.05:')
print(classification_report(y_val, y_pred_2))

# just ram and battery
model.fit(X_train_3, y_train)
y_pred_3 = dc.predict(X_val_3)

print('Result just for RAM an battery power:')
print(classification_report(y_val, y_pred_3))

In this case results are also not satisfactory. The best score is obtained when considering all features from the dataset.

# Decision Tree

In [17]:
from sklearn.tree import DecisionTreeClassifier,plot_tree #export_graphviz

# all
tree = DecisionTreeClassifier()
tree.fit(X_train,y_train)
y_pred = tree.predict(X_val)
print('Result for all columns:')
print(classification_report(y_val, y_pred))

# all standarasied
tree.fit(X_train_scaled, y_train)
y_pred_5= tree.predict(X_val_scaled)
print('Result for all columns, standaraised:')
print(classification_report(y_val, y_pred_5))

# over 0.02
tree.fit(X_train_1,y_train)
y_pred_1= tree.predict(X_val_1)
print('Result for features with importance greater than 0.02:')
print(classification_report(y_val, y_pred_1))

# over 0.032
tree.fit(X_train_4,y_train)
y_pred_4= tree.predict(X_val_4)
print('Result for features with importance greater than 0.032:')
print(classification_report(y_val, y_pred_4))

# over 0.05
tree.fit(X_train_2,y_train)
y_pred_2= tree.predict(X_val_2)
print('Result for features with importance greater than 0.05:')
print(classification_report(y_val, y_pred_2))

# just ram and battery power
tree.fit(X_train_3, y_train)
y_pred_3= tree.predict(X_val_3)
print('Result just for RAM an battery power:')
print(classification_report(y_val, y_pred_3))


Result for all columns:
              precision    recall  f1-score   support

           0       0.97      0.89      0.93        73
           1       0.74      0.84      0.78        74
           2       0.76      0.73      0.74        73
           3       0.89      0.88      0.88        74

    accuracy                           0.83       294
   macro avg       0.84      0.83      0.83       294
weighted avg       0.84      0.83      0.83       294

Result for all columns, standaraised:
              precision    recall  f1-score   support

           0       0.88      0.93      0.91        73
           1       0.75      0.72      0.73        74
           2       0.74      0.73      0.73        73
           3       0.88      0.88      0.88        74

    accuracy                           0.81       294
   macro avg       0.81      0.81      0.81       294
weighted avg       0.81      0.81      0.81       294

Result for features with importance greater than 0.02:
             

              precision    recall  f1-score   support

           0       0.97      0.89      0.93        73
           1       0.78      0.86      0.82        74
           2       0.77      0.77      0.77        73
           3       0.90      0.88      0.89        74

    accuracy                           0.85       294
   macro avg       0.86      0.85      0.85       294
weighted avg       0.86      0.85      0.85       294

Result for features with importance greater than 0.05:
              precision    recall  f1-score   support

           0       0.94      0.92      0.93        73
           1       0.81      0.85      0.83        74
           2       0.79      0.78      0.79        73
           3       0.90      0.89      0.90        74

    accuracy                           0.86       294
   macro avg       0.86      0.86      0.86       294
weighted avg       0.86      0.86      0.86       294

Result just for RAM an battery power:
              precision    recall  f1