## CS140 - Group B


In [0]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.base import TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 
from sklearn.ensemble import RandomForestClassifier

In [0]:
# Part-of-speech tag inventory (looks like the Penn Treebank tag set — TODO confirm).
# None of these three constants is referenced in the visible cells of this notebook.
TAGS = [
    'CC', 'CD', 'DT', 'EX', 'FW', 'IN',
    'JJ', 'JJR', 'JJS',
    'LS', 'MD',
    'NN', 'NNS', 'NNP', 'NNPS',
    'PDT', 'POS', 'PRP', 'PRP$',
    'RB', 'RBR', 'RBS', 'RP',
    'TO', 'UH',
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
    'WDT', 'WP', 'WP$', 'WRB',
]
# Modal verbs (presumably what the MODAL feature column was derived from — verify).
MODALS = [
    'will', 'would',
    'can', 'could',
    'may', 'might',
]
# Temporal cue words (presumably behind the TEMP_BEFORE / TEMP_AFTER / TEMP_SINCE columns — verify).
TEMPS = [
    'before',
    'after',
    'since',
]

In [0]:
# Combined feature table, read straight from the group's GitHub repository.
FEATURES_CSV_URL = 'https://raw.githubusercontent.com/YonglinWang-Brandeis/cs140-groupB-annotation-files/master/code/features_combine.csv'
df = pd.read_csv(FEATURES_CSV_URL)

In [56]:
# Peek at the first rows of the freshly loaded feature table.
df.head()


Unnamed: 0,POS1,POS2,BEFORE_1,AFTER_1,BEFORE_2,AFTER_2,DISTANCE,MODAL,TEMP_BEFORE,TEMP_AFTER,TEMP_SINCE,SYN,DERIVATION,RELATION
0,VBD,VBG,NNP,PRP,VBD,NNS,3,0,0.0,0.0,0.0,0,0,after
1,VBD,VB,NNP,PRP,MD,IN,12,1,0.0,0.0,0.0,0,0,after
2,VBG,VB,VBD,NNS,MD,IN,9,1,0.0,0.0,0.0,0,0,after
3,VBZ,VBD,EX,DT,NNP,PRP,6,0,0.0,0.0,0.0,0,0,vague
4,VBZ,VBD,EX,DT,NNS,NNP,16,0,0.0,0.0,0.0,0,0,vague


In [67]:
# (rows, columns) of the feature table.
df.shape

(4467, 15)

In [0]:
def encode_labels(label):
    """Map a RELATION value to its integer class id.

    'vague' -> 0, 'before' -> 1, 'after' -> 2. Any other value, including
    one that is not recognised at all, falls through to 3.
    """
    known_labels = ('vague', 'before', 'after')
    # The position in the tuple is the class id.
    for class_id, known in enumerate(known_labels):
        if label == known:
            return class_id
    return 3

In [0]:
# Read the labels from a snapshot of the frame, then attach the integer
# class id produced by encode_labels as a new 'y' column on df.
df_copy = df.copy()
relation_labels = df_copy['RELATION']
df['y'] = relation_labels.apply(encode_labels)

In [70]:
# Check that the new 'y' column lines up with RELATION.
df.head()

Unnamed: 0,POS1,POS2,BEFORE_1,AFTER_1,BEFORE_2,AFTER_2,DISTANCE,MODAL,TEMP_BEFORE,TEMP_AFTER,TEMP_SINCE,SYN,DERIVATION,RELATION,y
0,VBD,VBG,NNP,PRP,VBD,NNS,3,0,0.0,0.0,0.0,0,0,after,2
1,VBD,VB,NNP,PRP,MD,IN,12,1,0.0,0.0,0.0,0,0,after,2
2,VBG,VB,VBD,NNS,MD,IN,9,1,0.0,0.0,0.0,0,0,after,2
3,VBZ,VBD,EX,DT,NNP,PRP,6,0,0.0,0.0,0.0,0,0,vague,0
4,VBZ,VBD,EX,DT,NNS,NNP,16,0,0.0,0.0,0.0,0,0,vague,0


In [71]:
# List every column name currently on the frame.
column_names = df.columns.tolist()
print(column_names)

['POS1', 'POS2', 'BEFORE_1', 'AFTER_1', 'BEFORE_2', 'AFTER_2', 'DISTANCE', 'MODAL', 'TEMP_BEFORE', 'TEMP_AFTER', 'TEMP_SINCE', 'SYN', 'DERIVATION', 'RELATION', 'y']


In [0]:
# Target: the integer class id column.
y_data = df['y']

# Features: only the numeric/binary columns are used. The POS-tag columns
# (POS1, POS2, BEFORE_1, AFTER_1, BEFORE_2, AFTER_2) are left out of x_data.
feature_columns = ['DISTANCE', 'MODAL', 'TEMP_BEFORE', 'TEMP_AFTER', 'TEMP_SINCE', 'SYN', 'DERIVATION']
x_data = df[feature_columns]

In [0]:
# Hold out 20% of the rows for evaluation; the seed fixes the shuffle.
# NOTE(review): the split is not stratified although the classes are
# unbalanced — consider passing stratify=y_data.
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    train_size=0.8,
    random_state=0,
)

In [74]:
from collections import Counter

# Class distribution of the training labels (class id -> number of rows).
train_class_counts = Counter(y_train)
print('dataset shape {}'.format(train_class_counts))

dataset shape Counter({0: 1797, 1: 906, 3: 438, 2: 432})


In [99]:
# Class distribution over the full label column (class id -> number of rows).
print('dataset shape {}'.format(Counter(y_data)))

dataset shape Counter({0: 2238, 1: 1128, 2: 558, 3: 543})


In [14]:
!pip install imbalanced-learn



In [15]:
# Report which imbalanced-learn release the kernel actually imports.
import imblearn
print(imblearn.__version__)

0.4.3




In [0]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split as tts

In [132]:

from collections import Counter

# Class counts for the full label column first, then for the training split.
for labels in (y_data, y_train):
    print('dataset shape {}'.format(Counter(labels)))

dataset shape Counter({0: 2238, 1: 1128, 2: 558, 3: 543})
dataset shape Counter({0: 1797, 1: 906, 3: 438, 2: 432})


# Naïve Bayes

In [0]:
# Baseline: Bernoulli naive Bayes fitted on the original training split.
nb_steps = [('clf', BernoulliNB())]
pipeline = Pipeline(nb_steps)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [0]:

def test_model(X_test,y_test, model):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    report = classification_report(y_pred, y_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, y_pred),
                columns=['p_VAGE', 'p_BEFORE', 'p_AFTER','p_SIMU'], index=['VAGUE', 'BEFORE', 'AFTER','SIMU'])
    return accuracy, report, matrix

# Evaluate the fitted pipeline on the held-out split, then print the accuracy,
# the per-class report and the confusion matrix, separated by blank lines.
accuracy, report, matrix = test_model(X_test, y_test, pipeline)
accuracy_line = "the accuracy of the model is {accuracy:.{digits}f}".format(accuracy=accuracy, digits=2)
print(accuracy_line, report, matrix, sep="\n\n")

the accuracy of the model is 0.50

              precision    recall  f1-score   support

           0       0.99      0.50      0.66       882
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.07      0.58      0.12        12

    accuracy                           0.50       894
   macro avg       0.26      0.27      0.20       894
weighted avg       0.98      0.50      0.65       894


        p_VAGE  p_BEFORE  p_AFTER  p_SIMU
VAGUE      438         0        0       3
BEFORE     220         0        0       2
AFTER      126         0        0       0
SIMU        98         0        0       7


  _warn_prf(average, modifier, msg_start, len(result))


## SMOTE

In [131]:
from collections import Counter

pipeline = Pipeline([
    ('clf', BernoulliNB()),
])

# Oversample the training split only (default strategy). The seed keeps the
# synthetic rows, and therefore the scores, repeatable between runs.
smo = SMOTE(random_state=0)
# fit_resample is the supported method name; fit_sample was a deprecated
# alias that newer imbalanced-learn releases no longer provide.
X_smo, y_smo = smo.fit_resample(X_train, y_train)

# Class distribution after resampling.
print('dataset shape {}'.format(Counter(y_smo)))
pipeline.fit(X_smo, y_smo)
y_pred = pipeline.predict(X_test)

def test_model(X_test,y_test, model):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    report = classification_report(y_pred, y_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, y_pred),
                columns=['p_VAGE', 'p_BEFORE', 'p_AFTER','p_SIMU'], index=['VAGUE', 'BEFORE', 'AFTER','SIMU'])
    return accuracy, report, matrix

# Evaluate the fitted pipeline on the held-out split, then print the accuracy,
# the per-class report and the confusion matrix, separated by blank lines.
accuracy, report, matrix = test_model(X_test, y_test, pipeline)
accuracy_line = "the accuracy of the model is {accuracy:.{digits}f}".format(accuracy=accuracy, digits=2)
print(accuracy_line, report, matrix, sep="\n\n")

dataset shape Counter({1: 1797, 0: 1797, 3: 1797, 2: 1797})
the accuracy of the model is 0.26

              precision    recall  f1-score   support

           0       0.26      0.54      0.35       211
           1       0.07      0.35      0.12        46
           2       0.66      0.16      0.25       530
           3       0.23      0.22      0.23       107

    accuracy                           0.26       894
   macro avg       0.30      0.32      0.24       894
weighted avg       0.48      0.26      0.27       894


        p_VAGE  p_BEFORE  p_AFTER  p_SIMU
VAGUE      113        20      251      57
BEFORE      57        16      132      17
AFTER       25         9       83       9
SIMU        16         1       64      24




# Logistic Regression

In [0]:
# Baseline: multinomial logistic regression fitted on the original training split.
log_reg = LogisticRegression(solver='newton-cg', multi_class='multinomial')
pipeline = Pipeline([('clf', log_reg)])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [0]:

def test_model(X_test,y_test, model):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    report = classification_report(y_pred, y_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, y_pred),
                columns=['p_VAGE', 'p_BEFORE', 'p_AFTER','p_SIMU'], index=['VAGUE', 'BEFORE', 'AFTER','SIMU'])
    return accuracy, report, matrix

# Evaluate the fitted pipeline on the held-out split, then print the accuracy,
# the per-class report and the confusion matrix, separated by blank lines.
accuracy, report, matrix = test_model(X_test, y_test, pipeline)
accuracy_line = "the accuracy of the model is {accuracy:.{digits}f}".format(accuracy=accuracy, digits=2)
print(accuracy_line, report, matrix, sep="\n\n")

the accuracy of the model is 0.50

              precision    recall  f1-score   support

           0       0.99      0.50      0.66       872
           1       0.00      0.09      0.01        11
           2       0.00      0.00      0.00         0
           3       0.06      0.55      0.10        11

    accuracy                           0.50       894
   macro avg       0.26      0.28      0.19       894
weighted avg       0.97      0.50      0.65       894


        p_VAGE  p_BEFORE  p_AFTER  p_SIMU
VAGUE      436         2        0       3
BEFORE     219         1        0       2
AFTER      119         7        0       0
SIMU        98         1        0       6


  _warn_prf(average, modifier, msg_start, len(result))


## SMOTE

In [129]:
from collections import Counter

pipeline = Pipeline([
    ('clf', LogisticRegression(
        solver='newton-cg',
        multi_class='multinomial',
    )),
])

# Oversample the training split only, requesting 600 rows each for classes
# 2 and 3. The seed keeps the synthetic rows, and therefore the scores,
# repeatable between runs.
smo = SMOTE(sampling_strategy={2:600, 3:600}, random_state=0)
# fit_resample is the supported method name; fit_sample was a deprecated
# alias that newer imbalanced-learn releases no longer provide.
X_smo, y_smo = smo.fit_resample(X_train, y_train)

# Class distribution after resampling.
print('dataset shape {}'.format(Counter(y_smo)))
pipeline.fit(X_smo, y_smo)
y_pred = pipeline.predict(X_test)

def test_model(X_test,y_test, model):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    report = classification_report(y_pred, y_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, y_pred),
                columns=['p_VAGE', 'p_BEFORE', 'p_AFTER','p_SIMU'], index=['VAGUE', 'BEFORE', 'AFTER','SIMU'])
    return accuracy, report, matrix

# Evaluate the fitted pipeline on the held-out split, then print the accuracy,
# the per-class report and the confusion matrix, separated by blank lines.
accuracy, report, matrix = test_model(X_test, y_test, pipeline)
accuracy_line = "the accuracy of the model is {accuracy:.{digits}f}".format(accuracy=accuracy, digits=2)
print(accuracy_line, report, matrix, sep="\n\n")



dataset shape Counter({0: 1797, 1: 906, 3: 900, 2: 900})
the accuracy of the model is 0.50

              precision    recall  f1-score   support

           0       0.99      0.50      0.66       872
           1       0.00      0.10      0.01        10
           2       0.00      0.00      0.00         0
           3       0.07      0.58      0.12        12

    accuracy                           0.50       894
   macro avg       0.26      0.30      0.20       894
weighted avg       0.97      0.50      0.65       894


        p_VAGE  p_BEFORE  p_AFTER  p_SIMU
VAGUE      436         2        0       3
BEFORE     219         1        0       2
AFTER      119         7        0       0
SIMU        98         0        0       7


  _warn_prf(average, modifier, msg_start, len(result))


# Random Forest

In [140]:
from collections import Counter

# Baseline random forest on the original training split. random_state is
# fixed so that the forest, and the scores below, are the same on every run.
pipeline = Pipeline([
    ('clf', RandomForestClassifier(n_estimators=43,
                                   min_samples_split=20,
                                   max_depth=250,
                                   random_state=0)),
])

# Class distribution of the data the model is trained on.
print('dataset shape {}'.format(Counter(y_train)))
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

dataset shape Counter({0: 1797, 1: 906, 3: 438, 2: 432})


In [141]:

def test_model(X_test,y_test, model):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    report = classification_report(y_pred, y_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, y_pred),
                columns=['p_VAGE', 'p_BEFORE', 'p_AFTER','p_SIMU'], index=['VAGUE', 'BEFORE', 'AFTER','SIMU'])
    return accuracy, report, matrix

# Evaluate the fitted pipeline on the held-out split, then print the accuracy,
# the per-class report and the confusion matrix, separated by blank lines.
accuracy, report, matrix = test_model(X_test, y_test, pipeline)
accuracy_line = "the accuracy of the model is {accuracy:.{digits}f}".format(accuracy=accuracy, digits=2)
print(accuracy_line, report, matrix, sep="\n\n")

the accuracy of the model is 0.49

              precision    recall  f1-score   support

           0       0.87      0.51      0.64       762
           1       0.16      0.37      0.23        97
           2       0.05      0.38      0.08        16
           3       0.08      0.42      0.13        19

    accuracy                           0.49       894
   macro avg       0.29      0.42      0.27       894
weighted avg       0.76      0.49      0.57       894


        p_VAGE  p_BEFORE  p_AFTER  p_SIMU
VAGUE      385        46        5       5
BEFORE     179        36        3       4
AFTER      109         9        6       2
SIMU        89         6        2       8


## Random Forest SMOTE

In [145]:
from collections import Counter

# random_state is fixed on both the sampler and the forest so that the
# resampled data and the scores are the same on every run.
pipeline = Pipeline([
    ('clf', RandomForestClassifier(n_estimators=43,
                                   min_samples_split=2,
                                   max_depth=500,
                                   random_state=0)),
])

# Oversample the training split only, requesting 1000 rows for class 1,
# 600 for class 2 and 500 for class 3. sampling_strategy is passed by
# keyword because newer imbalanced-learn releases reject it positionally.
smo = SMOTE(sampling_strategy={1:1000, 2:600, 3:500}, random_state=0)
# fit_resample is the supported method name; fit_sample was a deprecated
# alias that newer imbalanced-learn releases no longer provide.
X_smo, y_smo = smo.fit_resample(X_train, y_train)

# Class distribution after resampling.
print('dataset shape {}'.format(Counter(y_smo)))
pipeline.fit(X_smo, y_smo)
y_pred = pipeline.predict(X_test)


def test_model(X_test,y_test, model):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    report = classification_report(y_pred, y_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, y_pred),
                columns=['p_VAGE', 'p_BEFORE', 'p_AFTER','p_SIMU'], index=['VAGUE', 'BEFORE', 'AFTER','SIMU'])
    return accuracy, report, matrix

# Evaluate the fitted pipeline on the held-out split, then print the accuracy,
# the per-class report and the confusion matrix, separated by blank lines.
accuracy, report, matrix = test_model(X_test, y_test, pipeline)
accuracy_line = "the accuracy of the model is {accuracy:.{digits}f}".format(accuracy=accuracy, digits=2)
print(accuracy_line, report, matrix, sep="\n\n")

dataset shape Counter({0: 1797, 1: 1000, 3: 800, 2: 800})
the accuracy of the model is 0.45

              precision    recall  f1-score   support

           0       0.76      0.50      0.61       662
           1       0.16      0.36      0.22       101
           2       0.11      0.22      0.15        64
           3       0.17      0.27      0.21        67

    accuracy                           0.45       894
   macro avg       0.30      0.34      0.30       894
weighted avg       0.60      0.45      0.50       894


        p_VAGE  p_BEFORE  p_AFTER  p_SIMU
VAGUE      334        54       28      25
BEFORE     154        36       13      19
AFTER       99         8       14       5
SIMU        75         3        9      18




## GradientBoosting

In [0]:
from sklearn.ensemble import GradientBoostingClassifier

In [98]:
from collections import Counter

# Baseline gradient boosting on the original (not resampled) training split.
# The SMOTE call that used to sit in this cell was removed: its output was
# never used, since the model is fitted on X_train / y_train. random_state is
# fixed so that the scores are the same on every run.
pipeline = Pipeline([
    ('clf', GradientBoostingClassifier(random_state=0)),
])

# Class distribution of the data the model is trained on.
print('dataset shape {}'.format(Counter(y_train)))
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

def test_model(X_test,y_test, model):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    report = classification_report(y_pred, y_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, y_pred),
                columns=['p_VAGE', 'p_BEFORE', 'p_AFTER','p_SIMU'], index=['VAGUE', 'BEFORE', 'AFTER','SIMU'])
    return accuracy, report, matrix

# Evaluate the fitted pipeline on the held-out split, then print the accuracy,
# the per-class report and the confusion matrix, separated by blank lines.
accuracy, report, matrix = test_model(X_test, y_test, pipeline)
accuracy_line = "the accuracy of the model is {accuracy:.{digits}f}".format(accuracy=accuracy, digits=2)
print(accuracy_line, report, matrix, sep="\n\n")



dataset shape Counter({0: 1797, 1: 906, 3: 438, 2: 432})
the accuracy of the model is 0.49

              precision    recall  f1-score   support

           0       0.93      0.50      0.65       820
           1       0.08      0.32      0.13        57
           2       0.04      0.83      0.08         6
           3       0.07      0.64      0.12        11

    accuracy                           0.49       894
   macro avg       0.28      0.57      0.24       894
weighted avg       0.85      0.49      0.60       894


        p_VAGE  p_BEFORE  p_AFTER  p_SIMU
VAGUE      408        30        1       2
BEFORE     202        18        0       2
AFTER      115         6        5       0
SIMU        95         3        0       7


## GradientBoosting SMOTE

In [146]:
from collections import Counter

# random_state is fixed on both the sampler and the model so that the
# resampled data and the scores are the same on every run.
pipeline = Pipeline([
    ('clf', GradientBoostingClassifier(random_state=0)),
])

# Oversample the training split only, requesting 1000 rows for class 1,
# 600 for class 2 and 500 for class 3.
smo = SMOTE(sampling_strategy={1:1000, 2:600, 3:500}, random_state=0)
# fit_resample is the supported method name; fit_sample was a deprecated
# alias that newer imbalanced-learn releases no longer provide.
X_smo, y_smo = smo.fit_resample(X_train, y_train)

# Class distribution before and after resampling.
print('dataset shape {}'.format(Counter(y_train)))
print('dataset shape {}'.format(Counter(y_smo)))
pipeline.fit(X_smo, y_smo)
y_pred = pipeline.predict(X_test)

def test_model(X_test,y_test, model):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    report = classification_report(y_pred, y_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, y_pred),
                columns=['p_VAGE', 'p_BEFORE', 'p_AFTER','p_SIMU'], index=['VAGUE', 'BEFORE', 'AFTER','SIMU'])
    return accuracy, report, matrix

# Evaluate the fitted pipeline on the held-out split, then print the accuracy,
# the per-class report and the confusion matrix, separated by blank lines.
accuracy, report, matrix = test_model(X_test, y_test, pipeline)
accuracy_line = "the accuracy of the model is {accuracy:.{digits}f}".format(accuracy=accuracy, digits=2)
print(accuracy_line, report, matrix, sep="\n\n")



dataset shape Counter({0: 1797, 1: 906, 3: 438, 2: 432})
dataset shape Counter({0: 1797, 1: 1000, 2: 600, 3: 500})
the accuracy of the model is 0.49

              precision    recall  f1-score   support

           0       0.91      0.50      0.64       807
           1       0.09      0.33      0.15        63
           2       0.04      0.56      0.07         9
           3       0.08      0.53      0.13        15

    accuracy                           0.49       894
   macro avg       0.28      0.48      0.25       894
weighted avg       0.83      0.49      0.59       894


        p_VAGE  p_BEFORE  p_AFTER  p_SIMU
VAGUE      401        33        2       5
BEFORE     198        21        1       2
AFTER      115         6        5       0
SIMU        93         3        1       8
