Tasks:

- Convert from multiclass to binary classification target
- Introduce outliers
- Create an imbalance of classes in target
- Introduce null values
- Introduce negative values
- Rename "train_mobil_data" file to "mobile_data"



In [33]:
# Import pandas
import pandas as pd

# Read the training split (the source file is spelled 'train_mobil_data')
df_train = pd.read_csv("./train_mobil_data.csv")

# Read the test split
df_test = pd.read_csv("./test_mobile_data.csv")



In [34]:
# Preview the first rows of the training set (head() shows 5 rows by default)
df_train.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [35]:
# Preview the first rows of the test set (head() shows 5 rows by default)
df_test.head()

Unnamed: 0,id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,1,1043,1,1.8,1,14,0,5,0.1,193,...,16,226,1412,3476,12,7,2,0,1,0
1,2,841,1,0.5,1,4,1,61,0.8,191,...,12,746,857,3895,6,0,7,1,0,0
2,3,1807,1,2.8,0,1,0,27,0.9,186,...,4,1270,1366,2396,17,10,10,0,1,1
3,4,1546,0,0.5,1,18,1,25,0.5,96,...,20,295,1752,3893,10,0,7,1,1,0
4,5,1434,0,1.4,0,11,1,49,0.5,108,...,18,749,810,1773,15,8,7,1,0,1


In [36]:
# Column names, non-null counts and dtypes of the test set
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1000 non-null   int64  
 1   battery_power  1000 non-null   int64  
 2   blue           1000 non-null   int64  
 3   clock_speed    1000 non-null   float64
 4   dual_sim       1000 non-null   int64  
 5   fc             1000 non-null   int64  
 6   four_g         1000 non-null   int64  
 7   int_memory     1000 non-null   int64  
 8   m_dep          1000 non-null   float64
 9   mobile_wt      1000 non-null   int64  
 10  n_cores        1000 non-null   int64  
 11  pc             1000 non-null   int64  
 12  px_height      1000 non-null   int64  
 13  px_width       1000 non-null   int64  
 14  ram            1000 non-null   int64  
 15  sc_h           1000 non-null   int64  
 16  sc_w           1000 non-null   int64  
 17  talk_time      1000 non-null   int64  
 18  three_g  

In [37]:
# Column names, non-null counts and dtypes of the training set
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [38]:
# (rows, columns) of the test set
df_test.shape

(1000, 21)

In [39]:
# (rows, columns) of the training set
df_train.shape

(2000, 21)



------------------
## NOTE:  ***The test dataset does NOT have a price_range column, so it should not be combined with the training dataset***
-------------------

# Reducing the Number of Classes from 4 to 2
- We are going to make the project a binary classification problem because students have not been introduced to multiclass classification

In [40]:
# Count how many rows fall into each class of the target column before it is modified
df_train.value_counts('price_range')

price_range
0    500
1    500
2    500
3    500
dtype: int64

In [41]:
# Collapse the four target classes into two: {0, 1} -> 0 and {2, 3} -> 1.
# The mapping is applied in place, so it must only run while the original
# classes 2/3 are still present: on an already-binarized column the first
# assignment would turn every remaining 1 into 0 and wipe out the positive class.
# The guard therefore makes this cell safe to re-run.
if df_train["price_range"].isin([2, 3]).any():
    df_train.loc[df_train["price_range"].isin([0, 1]), "price_range"] = 0
    df_train.loc[df_train["price_range"].isin([2, 3]), "price_range"] = 1

In [42]:
# Confirm the target now contains only the two classes 0 and 1
df_train.value_counts('price_range')

price_range
0    1000
1    1000
dtype: int64

# Introducing Outliers (weight, talk time)
- Introducing outliers to the data, to test the students on the data preparation section of the project

In [43]:
# Summary statistics of every column, to see the current value ranges before outliers are injected
df_train.describe()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1238.5185,0.495,1.52225,0.5095,4.3095,0.5215,32.0465,0.50175,140.249,4.5205,...,645.108,1251.5155,2124.213,12.3065,5.767,11.011,0.7615,0.503,0.507,0.5
std,439.418206,0.5001,0.816004,0.500035,4.341444,0.499662,18.145715,0.288416,35.399655,2.287837,...,443.780811,432.199447,1084.732044,4.213245,4.356398,5.463955,0.426273,0.500116,0.500076,0.500125
min,501.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,...,0.0,500.0,256.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0
25%,851.75,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,...,282.75,874.75,1207.5,9.0,2.0,6.0,1.0,0.0,0.0,0.0
50%,1226.0,0.0,1.5,1.0,3.0,1.0,32.0,0.5,141.0,4.0,...,564.0,1247.0,2146.5,12.0,5.0,11.0,1.0,1.0,1.0,0.5
75%,1615.25,1.0,2.2,1.0,7.0,1.0,48.0,0.8,170.0,7.0,...,947.25,1633.0,3064.5,16.0,9.0,16.0,1.0,1.0,1.0,1.0
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,...,1960.0,1998.0,3998.0,19.0,18.0,20.0,1.0,1.0,1.0,1.0


In [44]:
# Frequency of each distinct mobile_wt value, ordered from least to most common
df_train.value_counts('mobile_wt').sort_values()

mobile_wt
140     9
120     9
149     9
96      9
116    10
       ..
146    26
199    26
185    27
101    27
182    28
Length: 121, dtype: int64

In [45]:
# Select every phone whose weight is exactly 188 to pick a candidate row for the outlier.
# NOTE(review): the variable name says 200 but the filter value is 188; the name is
# kept unchanged in case other cells refer to it.
mobile_wt_200 = df_train.loc[df_train["mobile_wt"] == 188]
mobile_wt_200

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,0
75,819,0,0.6,1,8,1,42,0.9,188,6,...,339,1242,1814,5,4,13,1,0,1,0
607,922,0,1.5,1,1,1,35,0.9,188,7,...,1016,1383,1165,19,6,4,1,1,1,0
789,1614,0,1.9,0,8,1,55,0.3,188,3,...,860,1330,3914,17,13,13,1,0,0,1
819,1236,0,0.9,1,2,1,57,0.1,188,1,...,517,809,1406,14,12,20,1,0,1,0
996,774,0,0.5,1,2,1,10,0.5,188,2,...,1480,1731,2944,8,6,2,1,1,1,1
999,1777,1,3.0,0,3,0,20,0.6,188,6,...,511,616,3868,5,1,7,0,1,1,1
1112,586,0,0.5,1,1,0,57,0.3,188,2,...,1226,1389,3646,15,4,11,1,1,1,1
1188,732,1,2.3,1,7,1,39,0.6,188,7,...,1163,1554,3681,19,3,15,1,1,1,1
1365,1318,0,1.9,1,0,1,11,0.8,188,2,...,688,1591,1780,12,10,2,1,0,0,0


In [46]:
# Inject a weight outlier: the phone with mobile_wt 188 and battery_power 819
# gets its weight set to 300, above the observed maximum of 200.
is_target_phone = df_train["mobile_wt"].eq(188) & df_train["battery_power"].eq(819)
df_train.loc[is_target_phone, "mobile_wt"] = 300

In [48]:
# Verify the injection: list every row whose weight now exceeds 200
mobile_wt_outlier = df_train.loc[df_train["mobile_wt"].gt(200)]
mobile_wt_outlier

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
75,819,0,0.6,1,8,1,42,0.9,300,6,...,339,1242,1814,5,4,13,1,0,1,0
