# 机器学习实验 - 决策树

## Data Exploring

In [3]:
import pandas as pd

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
train_path = 'training.csv'
test_path = 'testing.csv'
valid_path = 'validation.csv'
train_data = pd.read_csv(train_path)
valid_data = pd.read_csv(valid_path)
test_data = pd.read_csv(test_path)

In [7]:
train_data.describe()

Unnamed: 0,recordId,usefulCount,rating
count,6999.0,6999.0,6999.0
mean,117818.948564,28.189027,3.712102
std,66920.098771,40.007725,1.556816
min,6.0,0.0,1.0
25%,59038.0,6.0,2.0
50%,118973.0,16.0,4.0
75%,175369.0,35.0,5.0
max,232143.0,949.0,5.0


In [8]:
train_data.head()

Unnamed: 0,recordId,drugName,condition,reviewComment,date,usefulCount,sideEffects,rating
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...","February 28, 2012",22,Mild Side Effects,5
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...","May 17, 2009",17,Severe Side Effects,4
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""","September 29, 2017",3,No Side Effects,5
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...","March 5, 2017",35,Mild Side Effects,5
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...","October 22, 2015",4,Severe Side Effects,5


In [9]:
train_data['sideEffects'].value_counts()

Mild Side Effects                2319
No Side Effects                  2080
Moderate Side Effects            1376
Severe Side Effects               835
Extremely Severe Side Effects     389
Name: sideEffects, dtype: int64

In [10]:
severeness = {'No Side Effects': 0, 
              'Mild Side Effects': 1, 
              'Moderate Side Effects': 2,
             'Severe Side Effects': 3,
             'Extremely Severe Side Effects': 4
             }

In [11]:
train_data['condition'].value_counts()

Birth Control                           1265
Depression                               427
Pain                                     270
Anxiety                                  248
Acne                                     234
                                        ... 
Condylomata Acuminata                      1
Organ Transplant, Rejection Reversal       1
Otitis Externa                             1
Oral Thrush                                1
Diagnosis and Investigation                1
Name: condition, Length: 408, dtype: int64

In [12]:
train_data['drugName'].value_counts()

Levonorgestrel                        174
Etonogestrel                          145
Ethinyl estradiol / norethindrone     102
Ethinyl estradiol / levonorgestrel     86
Nexplanon                              82
                                     ... 
Patanase                                1
Ibrance                                 1
Levalbuterol                            1
Eptifibatide                            1
Telaprevir                              1
Name: drugName, Length: 1300, dtype: int64

In [13]:
 train_data['date'].value_counts()

April 11, 2017        15
March 31, 2016        12
December 2, 2016      12
February 17, 2016     11
July 23, 2015         11
                      ..
March 25, 2011         1
December 30, 2010      1
October 31, 2008       1
September 13, 2016     1
August 5, 2010         1
Name: date, Length: 2737, dtype: int64

根据领域知识，我认为对于药的评分应该是和名称、评价日期无关的，因此这两个特征直接去除。

## 预处理函数

这里我先根据领域知识去掉了一些列，在剩下的列中，sideEffects 和 reviewComment 都是文本类型，都需要转换成数值化，这里我将副作用按照从轻到重严重程度从 1 排到 5，对于 reviewComment 则用情感分析，映射为 1 和 0，分别对应积极和消极。

In [2]:
!pip install transformers
import transformers
from transformers import pipeline

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m104.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4


In [15]:
model = pipeline('sentiment-analysis', device=0)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [17]:
model

<transformers.pipelines.text_classification.TextClassificationPipeline at 0x7f81017dae50>

In [20]:
def classify_sentiment(comment):
  result = model(comment)[0]['label']
  if result == 'POSITIVE':
    return 1
  else:
    return 0

In [19]:
import numpy as np

In [44]:
def severeness_encoding(s):
  s = severeness[s]
  return s

## 对训练数据进行预处理

In [23]:
# reviewComment, usefulCount, sideEffects
features = ['reviewComment', 'usefulCount', 'sideEffects']

In [41]:
X_train = train_data[features]
y_train = train_data['rating']

In [26]:
def truncate(text):
  if len(text) > 512:
    text = text[0:512]
  return text

In [42]:
X_train['reviewComment'] = X_train['reviewComment'].apply(lambda x : classify_sentiment(truncate(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['reviewComment'] = X_train['reviewComment'].apply(lambda x : classify_sentiment(truncate(x)))


In [45]:
X_train

Unnamed: 0,reviewComment,usefulCount,sideEffects
0,0,22,Mild Side Effects
1,1,17,Severe Side Effects
2,0,3,No Side Effects
3,0,35,Mild Side Effects
4,1,4,Severe Side Effects
...,...,...,...
6994,1,11,Mild Side Effects
6995,0,0,No Side Effects
6996,0,79,Mild Side Effects
6997,0,1,No Side Effects


In [46]:
X_train['sideEffects'] = X_train['sideEffects'].apply(severeness_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['sideEffects'] = X_train['sideEffects'].apply(severeness_encoding)


In [47]:
X_train['sideEffects']

0       1
1       3
2       0
3       1
4       3
       ..
6994    1
6995    0
6996    1
6997    0
6998    0
Name: sideEffects, Length: 6999, dtype: int64

In [48]:
y_train

0       5
1       4
2       5
3       5
4       5
       ..
6994    5
6995    4
6996    5
6997    1
6998    3
Name: rating, Length: 6999, dtype: int64

In [49]:
X_train.head()

Unnamed: 0,reviewComment,usefulCount,sideEffects
0,0,22,1
1,1,17,3
2,0,3,0
3,0,35,1
4,1,4,3


In [50]:
y_train

0       5
1       4
2       5
3       5
4       5
       ..
6994    5
6995    4
6996    5
6997    1
6998    3
Name: rating, Length: 6999, dtype: int64

In [53]:
# Define a function to calculate the Gini impurity
def gini_impurity(labels):
    # When the set is empty, return 0
    if labels.size == 0:
        return 0
    
    # Count the occurrences of each label
    counts = np.unique(labels, return_counts=True)[1]
    
    # Calculate the probabilities for each label
    fractions = counts / float(len(labels))
    
    # Return the Gini impurity
    return 1 - np.sum(fractions ** 2)

# Define a function to find the best split for a given feature
def find_best_split(feature, labels):
    # Initialize the best split
    best_split = {}
    
    # Find the unique values in the feature
    unique_values = np.unique(feature)
    
    # Iterate over all possible split values
    for split_value in unique_values:
        # Split the data into two groups based on the split value
        left_labels = labels[feature < split_value]
        right_labels = labels[feature >= split_value]
        
        # Calculate the Gini impurity for each group
        left_gini = gini_impurity(left_labels)
        right_gini = gini_impurity(right_labels)
        
        # Calculate the weighted average Gini impurity for this split
        n = len(labels)
        n_left = len(left_labels)
        n_right = len(right_labels)
        gini = (n_left / n) * left_gini + (n_right / n) * right_gini
        
        # Update the best split if this split is better
        if 'gini' not in best_split or gini < best_split['gini']:
            best_split = {
                'feature': feature.name,
                'split_value': split_value,
                'gini': gini,
                'left_labels': left_labels,
                'right_labels': right_labels,
            }
    
    # Return the best split
    return best_split

# Define a function to build a decision tree recursively
def build_tree(data, labels, depth=0, max_depth=3):
    # Create a leaf node with the most common label if we have reached the maximum depth
    if depth == max_depth:
        return {
            'leaf': True,
            'label': np.argmax(np.bincount(labels)),
        }
    
    # Find the best split for each feature
    splits = [find_best_split(data[feature], labels) for feature in data.columns]
    
    # Find the best split overall
    best_split = min(splits, key=lambda x: x['gini'])
    
    # Create an internal node with the best split
    node = {
        'leaf': False,
        'feature': best_split['feature'],
        'split_value': best_split['split_value'],
        'left_child': build_tree(data[data[best_split['feature']] < best_split['split_value']], best_split['left_labels'], depth + 1, max_depth),
        'right_child': build_tree(data[data[best_split['feature']] >= best_split['split_value']], best_split['right_labels'], depth + 1, max_depth),
    }
    
    return node

# Define a function to make predictions using a decision tree
def predict(tree, data):
    # If we have reached a leaf node, return its label
    if tree['leaf']:
        return tree['label']
    
    # Otherwise, move to the left or right child depending on the value of the feature in this data point
    if data[tree['feature']] < tree['split_value']:
        return predict(tree['left_child'], data)
    else:
        return predict(tree['right_child'], data)

In [60]:
tree = build_tree(X_train, y_train)

In [66]:
predictions_train = [predict(tree, X_train.iloc[i]) for i in range(len(X_train))]

In [68]:
train_acc = sum(predictions_train == y_train) / len(y_train)
train_acc

0.5076439491355907

## 对验证集数据进行预处理

In [63]:
X_valid = valid_data[features]
y_valid = valid_data['rating']

In [64]:
X_valid['sideEffects'] = X_valid['sideEffects'].apply(severeness_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['sideEffects'] = X_valid['sideEffects'].apply(severeness_encoding)


In [65]:
X_valid['reviewComment'] = X_valid['reviewComment'].apply(lambda x : classify_sentiment(truncate(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid['reviewComment'] = X_valid['reviewComment'].apply(lambda x : classify_sentiment(truncate(x)))


In [69]:
predictions_valid = [predict(tree, X_valid.iloc[i]) for i in range(len(X_valid))]

## 计算评估分数

In [83]:
from sklearn.metrics import f1_score

In [84]:
f1_micro = f1_score(y_valid, predictions_valid, average='micro')

In [98]:
print(f'Micro-F1: {f1_micro}')

Micro-F1: 0.5187656380316931


In [86]:
f1_macro = f1_score(y_valid, predictions_valid, average='macro')

In [99]:
print(f'Macro-F1: {f1_macro}')

Macro-F1: 0.2231501182877081


In [70]:
valid_acc = sum(predictions_valid == y_valid) / len(y_valid)

In [71]:
valid_acc

0.5187656380316931

## 下面对测试数据进行相同的预处理

In [74]:
X_test = test_data[features]

In [75]:
X_test.head()

Unnamed: 0,reviewComment,usefulCount,sideEffects
0,"""I was on Microgestin Fe 1/20 for three months...",1,Severe Side Effects
1,"""Have had clusters for 45 years, remember the ...",0,Moderate Side Effects
2,"""On June 6th 2015 I took plan B 4 HOURS after ...",13,Mild Side Effects
3,"""I tried Chantix for a period of 2 months. My...",7,Mild Side Effects
4,"""I was diagnosed with Sleep Apnea, for that I ...",11,Moderate Side Effects


In [77]:
X_test['sideEffects'] = X_test['sideEffects'].apply(severeness_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['sideEffects'] = X_test['sideEffects'].apply(severeness_encoding)


In [78]:
X_test['sideEffects']

0       3
1       2
2       1
3       1
4       2
       ..
1793    2
1794    3
1795    4
1796    1
1797    2
Name: sideEffects, Length: 1798, dtype: int64

In [79]:
X_test['reviewComment'] = X_test['reviewComment'].apply(lambda x : classify_sentiment(truncate(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['reviewComment'] = X_test['reviewComment'].apply(lambda x : classify_sentiment(truncate(x)))


In [80]:
predictions_test = [predict(tree, X_test.iloc[i]) for i in range(len(X_test))]

In [88]:
y_test = pd.DataFrame(data=predictions_test)

In [91]:
test_data['rating'] = y_test

In [92]:
test_data

Unnamed: 0,recordId,drugName,condition,reviewComment,date,usefulCount,sideEffects,rating
0,219597,Microgestin Fe 1.5 / 30,Birth Control,"""I was on Microgestin Fe 1/20 for three months...",10-Sep-17,1,Severe Side Effects,1
1,134044,Prednisone,Cluster Headaches,"""Have had clusters for 45 years, remember the ...",29-Aug-17,0,Moderate Side Effects,1
2,68176,Plan B,Emergency Contraception,"""On June 6th 2015 I took plan B 4 HOURS after ...",6-Jul-15,13,Mild Side Effects,5
3,200538,Varenicline,Smoking Cessation,"""I tried Chantix for a period of 2 months. My...",7-Jul-11,7,Mild Side Effects,1
4,46409,Modafinil,Narcolepsy,"""I was diagnosed with Sleep Apnea, for that I ...",1-Oct-09,11,Moderate Side Effects,5
...,...,...,...,...,...,...,...,...
1793,132278,Ativan,Insomnia,"""I was given Ativan for occasional use. I try ...",9-Aug-14,176,Moderate Side Effects,5
1794,126842,Erythromycin,Upper Respiratory Tract Infection,"""This antibiotic is the best, hands down. The...",26-Mar-16,8,Severe Side Effects,1
1795,68153,Plan B,Emergency Contraception,"""I took this pill on June 16th 2015. I was rea...",12-Aug-15,2,Extremely Severe Side Effects,5
1796,126865,Dilaudid,Pain,"""I developed severe pancreatitis and hepatitis...",2-Feb-10,58,Mild Side Effects,5


In [96]:
test_data.to_csv('result.csv', index=False)

In [94]:
from google.colab import files

In [97]:
files.download('result.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>