ОПИСАНИЕ ДАТАСЕТА

https://archive.ics.uci.edu/ml/machine-learning-databases/car/

| names file (C4.5 format) for car evaluation domain

| class values

unacc, acc, good, vgood

| attributes

buying:   vhigh, high, med, low.<br>
maint:    vhigh, high, med, low.<br>
doors:    2, 3, 4, 5more.<br>
persons:  2, 4, more.<br>
lug_boot: small, med, big.<br>
safety:   low, med, high.

In [1]:
import pandas as pd
import numpy as np

names=['buying','maint','doors','persons','lug_boot','safety','class']
data = pd.read_csv("car.data", header=None, names = names)

data.head(3)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc


In [2]:
print(data.shape)

(1728, 7)


In [3]:
data.dtypes

buying      object
maint       object
doors       object
persons     object
lug_boot    object
safety      object
class       object
dtype: object

Так как целевая переменная не бинарнарная объединим классы unacc, acc отметив их как плохие и good, vgood, как хорошие = 1

In [4]:
data['class'] = data['class'].apply(lambda x: 0 if x == 'unacc' or x == 'acc' else 1)
data['class'].value_counts()

0    1594
1     134
Name: class, dtype: int64

In [5]:
data.persons.astype(str)
data['persons'] = data['persons'].apply(lambda x: 8 if x == 'more' else x)
data['persons'] = pd.to_numeric(data['persons'])
data['persons'].value_counts()

8    576
4    576
2    576
Name: persons, dtype: int64

In [6]:
data.doors.astype(str)
data['doors'] = data['doors'].apply(lambda x: 5 if x == '5more' else x)
data['doors'] = pd.to_numeric(data['doors'])
data['doors'].value_counts()

5    432
4    432
3    432
2    432
Name: doors, dtype: int64

In [7]:
data['buying'] = data['buying'].apply(lambda x: 3 if x == 'vhigh' else x)
data['buying'] = data['buying'].apply(lambda x: 2 if x == 'high' else x)
data['buying'] = data['buying'].apply(lambda x: 1 if x == 'med' else 0)
data['buying'].value_counts()

0    1296
1     432
Name: buying, dtype: int64

In [8]:
data['maint'] = data['maint'].apply(lambda x: 3 if x == 'vhigh' else x)
data['maint'] = data['maint'].apply(lambda x: 2 if x == 'high' else x)
data['maint'] = data['maint'].apply(lambda x: 1 if x == 'med' else 0)

In [9]:
data['lug_boot'] = data['lug_boot'].apply(lambda x: 2 if x == 'big' else x)
data['lug_boot'] = data['lug_boot'].apply(lambda x: 1 if x == 'med' else 0)

data['safety'] = data['safety'].apply(lambda x: 2 if x == 'high' else x)
data['safety'] = data['safety'].apply(lambda x: 1 if x == 'med' else 0)

data.head(3)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,0,0,2,2,0,0,0
1,0,0,2,2,0,1,0
2,0,0,2,2,0,0,0


In [10]:
from sklearn.model_selection import train_test_split

x_data = data.iloc[:,:-1]
y_data = data.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

In [11]:
import xgboost as xgb

model = xgb.XGBClassifier()

model.fit(x_train, y_train)
y_predict = model.predict(x_test)

In [12]:
data.dtypes

buying      int64
maint       int64
doors       int64
persons     int64
lug_boot    int64
safety      int64
class       int64
dtype: object

In [13]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

df_result = pd.DataFrame({'type': [], 'roc_auc': [], 'precision': [], 'recall': [], 'f-score': [], })

def evaluate_results(y_test, y_predict, df_result, type_of):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 

    df_result.loc[len(df_result)+1] = [type_of, roc * 100.0, prc * 100.0, rec * 100.0, f1 * 100.0]
    
evaluate_results(y_test, y_predict, df_result, 'NaN')

Classification results:
f1: 0.00%
roc: 49.85%
recall: 0.00%
precision: 0.00%


PU learning

In [14]:
#25%

mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 34/134 as positives and unlabeling the rest


In [15]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    1694
 1      34
Name: class_test, dtype: int64


In [16]:
mod_data.head(10)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class,class_test
0,0,0,2,2,0,0,0,-1
1,0,0,2,2,0,1,0,-1
2,0,0,2,2,0,0,0,-1
3,0,0,2,2,1,0,0,-1
4,0,0,2,2,1,1,0,-1
5,0,0,2,2,1,0,0,-1
6,0,0,2,2,0,0,0,-1
7,0,0,2,2,0,1,0,-1
8,0,0,2,2,0,0,0,-1
9,0,0,2,4,0,0,0,-1


In [17]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

random negative sampling

In [18]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(34, 8) (34, 8)


In [19]:
model = xgb.XGBClassifier()

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict, df_result, '25%')

Classification results:
f1: 15.46%
roc: 60.90%
recall: 63.64%
precision: 8.80%


In [20]:
#40%

mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 40% of the positives marked
pos_sample_len = int(np.ceil(0.4 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 54/134 as positives and unlabeling the rest


In [21]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    1674
 1      54
Name: class_test, dtype: int64


In [22]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

In [23]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(54, 8) (54, 8)


In [24]:
model = xgb.XGBClassifier()

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict, df_result, '40%')

Classification results:
f1: 11.16%
roc: 56.73%
recall: 54.55%
precision: 6.21%


In [25]:
#70%

mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 70% of the positives marked
pos_sample_len = int(np.ceil(0.7 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 94/134 as positives and unlabeling the rest


In [26]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    1634
 1      94
Name: class_test, dtype: int64


In [27]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

In [28]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(94, 8) (94, 8)


In [29]:
model = xgb.XGBClassifier()

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict, df_result, '70%')

Classification results:
f1: 7.14%
roc: 64.37%
recall: 78.38%
precision: 3.74%


In [30]:
#90%

mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 90% of the positives marked
pos_sample_len = int(np.ceil(0.9 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 121/134 as positives and unlabeling the rest


In [31]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    1607
 1     121
Name: class_test, dtype: int64


In [32]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

In [33]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(121, 8) (121, 8)


In [34]:
model = xgb.XGBClassifier()

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict, df_result, '90%')

Classification results:
f1: 2.42%
roc: 64.63%
recall: 72.73%
precision: 1.23%


In [35]:
df_result

Unnamed: 0,type,roc_auc,precision,recall,f-score
1,,49.845201,0.0,0.0,0.0
2,25%,60.902102,8.798883,63.636364,15.460123
3,40%,56.728333,6.213018,54.545455,11.155378
4,70%,64.372157,3.741935,78.378378,7.142857
5,90%,64.634823,1.232666,72.727273,2.424242


По итогам эксперимента явно видно, что при небольшом проценте позитивных маркеров выше precision и вцелом f-score. При большом проценте вырос recall. Сам дата сет очень не сбалансированный так как процент TP очень мал, скорее всего из-за этого можно видеть большие искажения в метриках. Выбран был специально, что бы поэксперементировать на несбалансированных данных.