In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [2]:
class Tree:
    def __init__(self):
        self.threshold = None
        self.feature = None
        self.left = None
        self.right = None
        self.predict = None
        
    def set_Node(self, f, t):
        self.feature = f
        self.threshold = t
        
    def adopt_child(self, n, choose):
        if choose == 0:
            self.left = n
        elif choose == 1:
            self.right = n
            
    def set_predict(self, p):
        self.predict = p

In [3]:
def normal(df):
    features = df.columns
    
    for feature in features:
        if feature == 'class':
            continue
        maximal = df[feature].max(axis=0)
        minimal = df[feature].min(axis=0)
        for i in range(len(df[feature])):
            if not df[feature][i]:
                continue
            else:
                df[feature][i] = (df[feature][i] - minimal) / (maximal - minimal)
    return df

def row_distance(a, b, df):
    distance = 0
    features = df.columns
    for feature in features:
        if feature == 'class':
            continue
        elif df[feature][b] == -1 or df[feature][a] == -1:
            distance = 100000
        else:
            distance += (df[feature][a] - df[feature][b])**2  
    return math.sqrt(distance)

def Get_Neighbors(df, a):
    distance = []
    data = []
    for i in range(len(df.index)):
        dist = row_distance(i, a, df)
        distance.append(dist)
        data.append(i)
    distance = np.array(distance)
    data = np.array(data)
    index_dist = distance.argsort()
    data = data[index_dist]
    l = math.floor(len(df.index) * 0.05);
    neighbors = data[:l]
    return neighbors
                            

def distance(df):
    features = df.columns
    df = df.replace(np.nan, -1)
    for feature in features:
        for i in range(len(df.index)):
            if df[feature][i] == -1:
                n = Get_Neighbors(df, i)
                df[feature][i] = 0.0
                #print(len(n))
                for j in n:
                    df[feature][i] += 1 / len(n) * df[feature][j]
    return df

In [4]:
def fill_missing_value_mean(df):
    features = df.columns
    
    for feature in features:
        mean = df_x[feature].mean()
        df[feature] = df[feature].fillna(mean)
        
    return df

def remove_missing_value(df):
    return df.dropna()

In [5]:
def Entropy(df):
    result = df["class"].value_counts(normalize=True)
    e = 0.0
    
    for i in result:
        e -= i * math.log(i, 2)
        
    return e

def IG(df, df1, df2):
    p1 = df1["class"].count()
    p = df["class"].count()
    gain = Entropy(df) - ((p1 / p) * Entropy(df1)) - ((1 - p1 / p) * Entropy(df2))    
    
    return gain

def max_feature_gain(df, feature):
    maximal = -1
    threshold = 0
    
    for i in set(df[feature]):
        gain = IG(df, df[df[feature] <= i], df[df[feature] > i])
        if gain > maximal:
            maximal = gain
            threshold = i
            
    return maximal, threshold

In [6]:
def random_data(df, seed):
    return df.sample(frac = 0.2, random_state = seed)

def random_feature(features):
    length = math.floor(len(features) * 0.5)
    return np.random.choice(features, length, False)

In [7]:
def construct_tree(data, features):
    feature = ""
    threshold = 0
    maximal = -1
    
    for i in features:
        val, standard = max_feature_gain(data, i)
        if val > maximal:
            maximal = val
            threshold = standard
            feature = i
    
    node = Tree()
    node.set_Node(feature, threshold)
    if maximal > 0:
        node.adopt_child(construct_tree(data[data[feature] <= threshold], features), 0)
        node.adopt_child(construct_tree(data[data[feature] >  threshold], features), 1)
        
    if node.left is None and node.right is None:
        predicts = data["class"].value_counts()
        label_num = 0
        label = 0
        for k, j in predicts.items():
            if j > label_num:
                label_num = j
                label = k
        node.set_predict(label)
        
    return node

In [8]:
def report(predictions, y_test):
    print('Accuracy: %s' % accuracy_score(y_test, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, predictions))
    print('Classification Report:')
    print(classification_report(y_test, predictions))

In [9]:
def apply(row, node):
    if node.predict is not None:
        return node.predict
    elif row[node.feature] <= node.threshold:
        return apply(row, node.left)
    else:
        return apply(row, node.right)

def fit(df, forest):
    correct = 0
    count = 0
    predict_list = []
    for i, row in df.iterrows():
        count += 1
        ans = []
        for j in forest:
            re = apply(row, j)
            ans.append(re)
        vote = max(set(ans), key = ans.count)
        predict_list.append(vote)
    report(predict_list, df['class'].tolist())
        #if vote == df["class"][i]:
            #correct += 1
    #print('fit: ', correct / count)
    return correct / count

def cv(df, root):
    correct = 0
    count = 0
    for i, row in df.iterrows():
        count += 1
        ans = []
        re = apply(row, root)
        if re == df["class"][i]:
            correct += 1
    #print('cv: ', correct / count)
    return correct / count

In [10]:
def Decision_Tree(train, test, features):
    #train = data.sample(frac = 0.7, random_state = 1)
    #test = data.drop(train.index)
        
    features_valid = []
    for j in features:
        if j == 'class':
            continue
        features_valid.append(j)
    forest = []
    
    count = 0
    while len(forest) < 10:
        df = random_data(train, count)
        feature_set = random_feature(features_valid)
        root = construct_tree(df, feature_set)
        if cv(test, root) > 0.45:
            forest.append(root)
        count += 1
        
    acc_test = fit(test, forest)
    return forest

In [14]:
df_x = pd.read_excel("Dataset1/train/X_train.xlsx")
df_y = pd.read_excel("Dataset1/train/y_train.xlsx")
#df_x = fill_missing_value_mean(df_x)
df_x = normal(df_x)
df_x = distance(df_x)
data = df_x.join(df_y)
print(data)

      fixed_acidity  volatile_acidity  citric_acid  residual_sugar  chlorides  \
0          0.218182          0.090909         0.40        0.028169   0.085284   
1          0.290909          0.396694         0.26        0.056338   0.113712   
2          0.463636          0.471074         0.32        0.091549   0.127090   
3          0.672727          0.214876         0.56        0.063380   0.135452   
4          0.163636          0.429752         0.21        0.042254   0.115385   
...             ...               ...          ...             ...        ...   
1018       0.218182          0.347107         0.09        0.056338   0.115385   
1019       0.509091          0.350754         0.48        0.093296   0.066890   
1020       0.290909          0.338843         0.04        0.035211   0.107023   
1021       0.245455          0.256198         0.24        0.091549   0.110368   
1022       0.409091          0.314050         0.30        0.049296   0.088629   

      free_sulfur_dioxide  

In [15]:
size = math.floor(data['class'].count() / 10)
fold = []
for i in range(10):
    df = data.iloc[:size, :]
    fold.append(df)
    data = data.iloc[size:, :]
    
for i in range(len(fold)):
    train = pd.DataFrame()
    for j in range(len(fold)):
        if j != i:
            train = train.append(fold[j])
    test = fold[i]
    forest = Decision_Tree(train, test, train.columns)
    acc_tmp = fit(test, forest)

Accuracy: 0.5588235294117647
Confusion Matrix:
[[ 0  2  3  0  0]
 [ 0 25  7  0  0]
 [ 0 16 32  1  0]
 [ 0  3 11  0  0]
 [ 0  1  0  1  0]]
Classification Report:
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         5
           5       0.53      0.78      0.63        32
           6       0.60      0.65      0.63        49
           7       0.00      0.00      0.00        14
           8       0.00      0.00      0.00         2

    accuracy                           0.56       102
   macro avg       0.23      0.29      0.25       102
weighted avg       0.46      0.56      0.50       102

Accuracy: 0.5588235294117647
Confusion Matrix:
[[ 0  2  3  0  0]
 [ 0 25  7  0  0]
 [ 0 16 32  1  0]
 [ 0  3 11  0  0]
 [ 0  1  0  1  0]]
Classification Report:
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         5
           5       0.53      0.78      0.63        32
           6       0.60   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.6078431372549019
Confusion Matrix:
[[ 0  1  0  0  0  0]
 [ 0  0  1  1  0  0]
 [ 0  0 41  4  1  0]
 [ 0  1 18 18  2  0]
 [ 0  0  1  8  3  0]
 [ 0  0  0  2  0  0]]
Classification Report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         2
           5       0.67      0.89      0.77        46
           6       0.55      0.46      0.50        39
           7       0.50      0.25      0.33        12
           8       0.00      0.00      0.00         2

    accuracy                           0.61       102
   macro avg       0.29      0.27      0.27       102
weighted avg       0.57      0.61      0.58       102

Accuracy: 0.6078431372549019
Confusion Matrix:
[[ 0  1  0  0  0  0]
 [ 0  0  1  1  0  0]
 [ 0  0 41  4  1  0]
 [ 0  1 18 18  2  0]
 [ 0  0  1  8  3  0]
 [ 0  0  0  2  0  0]]
Classification Report:
              precision    recall  f1-score   support

        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.6274509803921569
Confusion Matrix:
[[ 0  0  1  0  0  0]
 [ 0  0  2  1  0  0]
 [ 0  0 33  6  0  0]
 [ 0  0 14 29  0  0]
 [ 0  0  3  9  2  0]
 [ 0  0  0  1  1  0]]
Classification Report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3
           5       0.62      0.85      0.72        39
           6       0.63      0.67      0.65        43
           7       0.67      0.14      0.24        14
           8       0.00      0.00      0.00         2

    accuracy                           0.63       102
   macro avg       0.32      0.28      0.27       102
weighted avg       0.60      0.63      0.58       102

Accuracy: 0.6274509803921569
Confusion Matrix:
[[ 0  0  1  0  0  0]
 [ 0  0  2  1  0  0]
 [ 0  0 33  6  0  0]
 [ 0  0 14 29  0  0]
 [ 0  0  3  9  2  0]
 [ 0  0  0  1  1  0]]
Classification Report:
              precision    recall  f1-score   support

        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.6470588235294118
Confusion Matrix:
[[ 0  0  2  0  0  0]
 [ 0  0  3  1  0  0]
 [ 0  0 30  7  1  0]
 [ 0  0 10 32  1  0]
 [ 0  0  3  6  4  0]
 [ 0  0  0  2  0  0]]
Classification Report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         4
           5       0.62      0.79      0.70        38
           6       0.67      0.74      0.70        43
           7       0.67      0.31      0.42        13
           8       0.00      0.00      0.00         2

    accuracy                           0.65       102
   macro avg       0.33      0.31      0.30       102
weighted avg       0.60      0.65      0.61       102

Accuracy: 0.6470588235294118
Confusion Matrix:
[[ 0  0  2  0  0  0]
 [ 0  0  3  1  0  0]
 [ 0  0 30  7  1  0]
 [ 0  0 10 32  1  0]
 [ 0  0  3  6  4  0]
 [ 0  0  0  2  0  0]]
Classification Report:
              precision    recall  f1-score   support

        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.6470588235294118
Confusion Matrix:
[[ 0  2  0  0]
 [ 0 33  7  1]
 [ 0 17 31  0]
 [ 0  2  7  2]]
Classification Report:
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         2
           5       0.61      0.80      0.69        41
           6       0.69      0.65      0.67        48
           7       0.67      0.18      0.29        11

    accuracy                           0.65       102
   macro avg       0.49      0.41      0.41       102
weighted avg       0.64      0.65      0.62       102

Accuracy: 0.6470588235294118
Confusion Matrix:
[[ 0  2  0  0]
 [ 0 33  7  1]
 [ 0 17 31  0]
 [ 0  2  7  2]]
Classification Report:
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         2
           5       0.61      0.80      0.69        41
           6       0.69      0.65      0.67        48
           7       0.67      0.18      0.29        11

    accuracy                   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.6568627450980392
Confusion Matrix:
[[ 0  4  2  0  0]
 [ 0 35 11  0  0]
 [ 1  9 29  1  0]
 [ 0  2  4  3  0]
 [ 0  0  1  0  0]]
Classification Report:
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         6
           5       0.70      0.76      0.73        46
           6       0.62      0.72      0.67        40
           7       0.75      0.33      0.46         9
           8       0.00      0.00      0.00         1

    accuracy                           0.66       102
   macro avg       0.41      0.36      0.37       102
weighted avg       0.62      0.66      0.63       102

Accuracy: 0.6568627450980392
Confusion Matrix:
[[ 0  4  2  0  0]
 [ 0 35 11  0  0]
 [ 1  9 29  1  0]
 [ 0  2  4  3  0]
 [ 0  0  1  0  0]]
Classification Report:
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         6
           5       0.70      0.76      0.73        46
           6       0.62   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.5784313725490197
Confusion Matrix:
[[ 0  0  1  0  0  0]
 [ 0  0  3  1  0  0]
 [ 0  0 37 11  0  0]
 [ 0  0 13 15  2  2]
 [ 0  0  2  7  7  0]
 [ 0  0  0  0  1  0]]
Classification Report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         4
           5       0.66      0.77      0.71        48
           6       0.44      0.47      0.45        32
           7       0.70      0.44      0.54        16
           8       0.00      0.00      0.00         1

    accuracy                           0.58       102
   macro avg       0.30      0.28      0.28       102
weighted avg       0.56      0.58      0.56       102

Accuracy: 0.5784313725490197
Confusion Matrix:
[[ 0  0  1  0  0  0]
 [ 0  0  3  1  0  0]
 [ 0  0 37 11  0  0]
 [ 0  0 13 15  2  2]
 [ 0  0  2  7  7  0]
 [ 0  0  0  0  1  0]]
Classification Report:
              precision    recall  f1-score   support

        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.5882352941176471
Confusion Matrix:
[[ 0  1  1  0  0]
 [ 0 33 11  0  0]
 [ 0 18 24  0  0]
 [ 0  1  9  3  0]
 [ 0  0  1  0  0]]
Classification Report:
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         2
           5       0.62      0.75      0.68        44
           6       0.52      0.57      0.55        42
           7       1.00      0.23      0.38        13
           8       0.00      0.00      0.00         1

    accuracy                           0.59       102
   macro avg       0.43      0.31      0.32       102
weighted avg       0.61      0.59      0.57       102

Accuracy: 0.5882352941176471
Confusion Matrix:
[[ 0  1  1  0  0]
 [ 0 33 11  0  0]
 [ 0 18 24  0  0]
 [ 0  1  9  3  0]
 [ 0  0  1  0  0]]
Classification Report:
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         2
           5       0.62      0.75      0.68        44
           6       0.52   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.6862745098039216
Confusion Matrix:
[[ 0  1  0  0  0]
 [ 0 39  5  0  0]
 [ 0 16 25  4  0]
 [ 0  3  2  6  0]
 [ 0  0  1  0  0]]
Classification Report:
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         1
           5       0.66      0.89      0.76        44
           6       0.76      0.56      0.64        45
           7       0.60      0.55      0.57        11
           8       0.00      0.00      0.00         1

    accuracy                           0.69       102
   macro avg       0.40      0.40      0.39       102
weighted avg       0.68      0.69      0.67       102

Accuracy: 0.6862745098039216
Confusion Matrix:
[[ 0  1  0  0  0]
 [ 0 39  5  0  0]
 [ 0 16 25  4  0]
 [ 0  3  2  6  0]
 [ 0  0  1  0  0]]
Classification Report:
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         1
           5       0.66      0.89      0.76        44
           6       0.76   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.5882352941176471
Confusion Matrix:
[[ 0  0  1  2  0  0]
 [ 0  1  0  3  0  0]
 [ 0  0 36 12  1  0]
 [ 0  0  9 21  2  1]
 [ 0  0  0  9  2  0]
 [ 0  0  0  1  1  0]]
Classification Report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       1.00      0.25      0.40         4
           5       0.78      0.73      0.76        49
           6       0.44      0.64      0.52        33
           7       0.33      0.18      0.24        11
           8       0.00      0.00      0.00         2

    accuracy                           0.59       102
   macro avg       0.43      0.30      0.32       102
weighted avg       0.59      0.59      0.57       102

Accuracy: 0.5882352941176471
Confusion Matrix:
[[ 0  0  1  2  0  0]
 [ 0  1  0  3  0  0]
 [ 0  0 36 12  1  0]
 [ 0  0  9 21  2  1]
 [ 0  0  0  9  2  0]
 [ 0  0  0  1  1  0]]
Classification Report:
              precision    recall  f1-score   support

        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
df_x = pd.read_excel("Dataset1/train/X_train.xlsx")
df_y = pd.read_excel("Dataset1/train/y_train.xlsx")
#df_x = fill_missing_value_mean(df_x)
df_x = normal(df_x)
df_x = distance(df_x)
test_set = pd.read_excel("Dataset1_test/X_test.xlsx")
test_set = fill_missing_value_mean(test_set)

ss = StratifiedShuffleSplit(n_splits = 1, test_size = 0.3, train_size = 0.7, random_state = 0)
train = pd.DataFrame()
test  = pd.DataFrame()
for train_idx, test_idx in ss.split(df_x, df_y):
    for j in train_idx:
        train = train.append(df_x[j:j + 1].join(df_y[j:j + 1]))
    for j in test_idx:
        test = test.append(df_x[j:j + 1].join(df_y[j:j + 1]))

model = Decision_Tree(train, test, train.columns)
acc = fit(test, model)

predict_ans = pd.DataFrame()
for i, row in test_set.iterrows():
    predict = []
    for j in model:
        re = apply(row, j)
        predict.append(re)
    #print(predict)
    vote = max(set(predict), key = predict.count)
    tmp = pd.DataFrame([vote], columns = ['return'])
    predict_ans = predict_ans.append(tmp)
    
writer = pd.ExcelWriter('./test_predict_dataset1.xlsx')
predict_ans.to_excel(writer,index=False)
print('done')
writer.save()

Accuracy: 0.5993485342019544
Confusion Matrix:
[[ 0  0  3  0  0  0]
 [ 0  0  7  3  0  0]
 [ 1  0 90 36  1  0]
 [ 0  0 34 82  8  1]
 [ 0  0  5 20 12  0]
 [ 0  0  0  2  2  0]]
Classification Report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00        10
           5       0.65      0.70      0.67       128
           6       0.57      0.66      0.61       125
           7       0.52      0.32      0.40        37
           8       0.00      0.00      0.00         4

    accuracy                           0.60       307
   macro avg       0.29      0.28      0.28       307
weighted avg       0.57      0.60      0.58       307

Accuracy: 0.5993485342019544
Confusion Matrix:
[[ 0  0  3  0  0  0]
 [ 0  0  7  3  0  0]
 [ 1  0 90 36  1  0]
 [ 0  0 34 82  8  1]
 [ 0  0  5 20 12  0]
 [ 0  0  0  2  2  0]]
Classification Report:
              precision    recall  f1-score   support

        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
import nltk
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
def preprocess_data(df):
    reviews = []
    for raw in tqdm(df['Phrase']):
        text = re.sub('[^a-zA-Z]', ' ', str(raw))
        words = word_tokenize(text.lower())
        stops = set(stopwords.words('english'))
        non_stopwords = [word for word in words if not word in stops]
        lemma_words = [lemmatizer.lemmatize(word) for word in non_stopwords]    
        reviews.append(lemma_words)
    return reviews

In [13]:
def tokenizer_preprocess(list_X_train, list_X_val, list_test):
    unique_words = set()
    len_max = 0
    for sent in tqdm(list_X_train):
        unique_words.update(sent)
        if len_max < len(sent):
            len_max = len(sent)
    len(list(unique_words)), len_max

    tokenizer = Tokenizer(num_words=len(list(unique_words)))
    tokenizer.fit_on_texts(list(list_X_train))
     
    X_train = tokenizer.texts_to_sequences(list_X_train)
    X_train = sequence.pad_sequences(X_train, maxlen=len_max)

    X_val = tokenizer.texts_to_sequences(list_X_val)
    X_val = sequence.pad_sequences(X_val, maxlen=len_max)
    
    X_test = tokenizer.texts_to_sequences(list_test)
    X_test = sequence.pad_sequences(X_test, maxlen=len_max)

    return X_train, X_val, X_test

In [14]:
df_x = pd.read_excel("Dataset2/Dataset2_train/X_train.xlsx")
df_y = pd.read_excel("Dataset2/Dataset2_train/y_train.xlsx")
test_set = pd.read_excel("Dataset2_test/X_test.xlsx")

In [15]:
df_x = preprocess_data(df_x)
X_test = preprocess_data(test_set)

100%|████████████████████████████████████████████████████████████████████████| 124848/124848 [01:09<00:00, 1800.19it/s]
100%|██████████████████████████████████████████████████████████████████████████| 31212/31212 [00:16<00:00, 1910.74it/s]


In [16]:
X_train, X_val, y_train, y_val = train_test_split(df_x, df_y, test_size=0.2, stratify=df_y)

In [17]:
X_train_, X_val_, X_test_ = tokenizer_preprocess(X_train, X_val, X_test) 

100%|████████████████████████████████████████████████████████████████████████| 99878/99878 [00:00<00:00, 600226.80it/s]


In [18]:
y_list = list(y_train['Sentiment'])

In [19]:
feature_set = ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9','f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'class']
train = pd.DataFrame()
test  = pd.DataFrame()

for i in tqdm(range(len(X_train_))):
    feature_val = []
    for j in range(29):
        feature_val.append(X_train_[i][j])
    feature_val.append(y_list[i])
    tmp = pd.DataFrame([feature_val], columns = feature_set)
    train = train.append(tmp, ignore_index=True)

100%|███████████████████████████████████████████████████████████████████████████| 99878/99878 [07:22<00:00, 225.90it/s]


In [20]:
feature_test = ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9','f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28']
y_set = pd.DataFrame()
for i in tqdm(range(len(X_test_))):
    feature_val = []
    for j in range(29):
        feature_val.append(X_test_[i][j])
    tmp = pd.DataFrame([feature_val], columns = feature_test)
    y_set = y_set.append(tmp, ignore_index=True)

100%|███████████████████████████████████████████████████████████████████████████| 31212/31212 [01:14<00:00, 421.08it/s]


In [21]:
print(y_set)

       f0  f1  f2  f3  f4  f5  f6  f7  f8  f9  ...  f19  f20  f21   f22   f23  \
0       0   0   0   0   0   0   0   0   0   0  ...   93   16    9  1081  2015   
1       0   0   0   0   0   0   0   0   0   0  ...    0    0    0     0     0   
2       0   0   0   0   0   0   0   0   0   0  ...    0    0    0     0     0   
3       0   0   0   0   0   0   0   0   0   0  ...    0    0    0     0     0   
4       0   0   0   0   0   0   0   0   0   0  ...    0    0    0     0     0   
...    ..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ...  ...  ...   ...   ...   
31207   0   0   0   0   0   0   0   0   0   0  ...    0    0    0     0     0   
31208   0   0   0   0   0   0   0   0   0   0  ...    0    0    0     0     0   
31209   0   0   0   0   0   0   0   0   0   0  ...    0    0    0     0     0   
31210   0   0   0   0   0   0   0   0   0   0  ...    0    0    0     0     0   
31211   0   0   0   0   0   0   0   0   0   0  ...    0    0    0     0     0   

        f24    f25   f26   

In [22]:
y_list = list(y_val['Sentiment'])

for i in tqdm(range(len(X_val_))):
    feature_val = []
    for j in range(29):
        feature_val.append(X_val_[i][j])
    feature_val.append(y_list[i])
    tmp = pd.DataFrame([feature_val], columns = feature_set)
    test = test.append(tmp, ignore_index=True)

100%|███████████████████████████████████████████████████████████████████████████| 24970/24970 [01:01<00:00, 409.33it/s]


In [23]:
model = Decision_Tree(train, test, train.columns)
acc = fit(test, model)

Accuracy: 0.5559070885062074
Confusion Matrix:
[[  144   329   564    87     8]
 [  171  1008  2927   239    19]
 [   69   627 11518   497    22]
 [   35   317  3736  1070   110]
 [   19   101   831   381   141]]
Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.13      0.18      1132
           1       0.42      0.23      0.30      4364
           2       0.59      0.90      0.71     12733
           3       0.47      0.20      0.28      5268
           4       0.47      0.10      0.16      1473

    accuracy                           0.56     24970
   macro avg       0.46      0.31      0.33     24970
weighted avg       0.52      0.56      0.49     24970

Accuracy: 0.5559070885062074
Confusion Matrix:
[[  144   329   564    87     8]
 [  171  1008  2927   239    19]
 [   69   627 11518   497    22]
 [   35   317  3736  1070   110]
 [   19   101   831   381   141]]
Classification Report:
              precision    recall  f1-s

In [24]:
predict_ans2 = pd.DataFrame()
for i, row in tqdm(y_set.iterrows()):
    predict2 = []
    for j in model:
        re = apply(row, j)
        predict2.append(re)
    vote = max(set(predict2), key = predict2.count)
    tmp = pd.DataFrame([vote], columns = ['return'])
    predict_ans2 = predict_ans2.append(tmp)
    
writer = pd.ExcelWriter('./test_predict_dataset2.xlsx')
predict_ans2.to_excel(writer,index=False)
print('done')
writer.save()

31212it [00:23, 1307.70it/s]


done


In [55]:
df_x = pd.read_excel("Dataset1/train/X_train.xlsx")
df_y = pd.read_excel("Dataset1/train/y_train.xlsx")
#df_x = fill_missing_value_mean(df_x)
df_x = normal(df_x)
df_x = distance(df_x)
test_set = pd.read_excel("Dataset1_test/X_test.xlsx")
test_set = fill_missing_value_mean(test_set)

ss = StratifiedShuffleSplit(n_splits = 1, test_size = 0.3, train_size = 0.7, random_state = 0)
train = pd.DataFrame()
test  = pd.DataFrame()
for train_idx, test_idx in ss.split(df_x, df_y):
    for j in train_idx:
        train = train.append(df_x[j:j + 1].join(df_y[j:j + 1]))
    for j in test_idx:
        test = test.append(df_x[j:j + 1].join(df_y[j:j + 1]))

writer = pd.ExcelWriter('./train_relation_dataset1.xlsx')
train.corr().to_excel(writer,index=False)
print('done')
writer.save()

done
