## CS140 - Group B


In [0]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.base import TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 
from sklearn.ensemble import RandomForestClassifier

In [0]:
TAGS = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']
MODALS = ['will', 'would', 'can', 'could', 'may', 'might']
TEMPS = ['before', 'after', 'since']

In [0]:
df = pd.read_csv('https://raw.githubusercontent.com/YonglinWang-Brandeis/cs140-groupB-annotation-files/master/code/features_combine.csv')

In [45]:
df.head()


Unnamed: 0,POS1,POS2,BEFORE_1,AFTER_1,BEFORE_2,AFTER_2,DISTANCE,MODAL,TEMP_BEFORE,TEMP_AFTER,TEMP_SINCE,SYN,DERIVATION,RELATION
0,VBD,VBG,NNP,PRP,VBD,NNS,3,0,0.0,0.0,0.0,0,0,after
1,VBD,VB,NNP,PRP,MD,IN,12,1,0.0,0.0,0.0,0,0,after
2,VBG,VB,VBD,NNS,MD,IN,9,1,0.0,0.0,0.0,0,0,after
3,VBZ,VBD,EX,DT,NNP,PRP,6,0,0.0,0.0,0.0,0,0,vague
4,VBZ,VBD,EX,DT,NNS,NNP,16,0,0.0,0.0,0.0,0,0,vague


In [46]:
df.shape

(4467, 14)

In [0]:
def encode_labels(label):
    if label == 'before':
        return 0
    elif label == 'after':
        return 1
    elif label == 'simultaneous':
        return 2
    else:
        return 3

In [0]:
df_copy = df.copy()
df['y'] = df_copy.RELATION.apply(encode_labels)

In [49]:
df.head()

Unnamed: 0,POS1,POS2,BEFORE_1,AFTER_1,BEFORE_2,AFTER_2,DISTANCE,MODAL,TEMP_BEFORE,TEMP_AFTER,TEMP_SINCE,SYN,DERIVATION,RELATION,y
0,VBD,VBG,NNP,PRP,VBD,NNS,3,0,0.0,0.0,0.0,0,0,after,1
1,VBD,VB,NNP,PRP,MD,IN,12,1,0.0,0.0,0.0,0,0,after,1
2,VBG,VB,VBD,NNS,MD,IN,9,1,0.0,0.0,0.0,0,0,after,1
3,VBZ,VBD,EX,DT,NNP,PRP,6,0,0.0,0.0,0.0,0,0,vague,3
4,VBZ,VBD,EX,DT,NNS,NNP,16,0,0.0,0.0,0.0,0,0,vague,3


In [0]:
df = df[~ df.isin([3])]

In [51]:
df.head()

Unnamed: 0,POS1,POS2,BEFORE_1,AFTER_1,BEFORE_2,AFTER_2,DISTANCE,MODAL,TEMP_BEFORE,TEMP_AFTER,TEMP_SINCE,SYN,DERIVATION,RELATION,y
0,VBD,VBG,NNP,PRP,VBD,NNS,,0.0,0.0,0.0,0.0,0,0,after,1.0
1,VBD,VB,NNP,PRP,MD,IN,12.0,1.0,0.0,0.0,0.0,0,0,after,1.0
2,VBG,VB,VBD,NNS,MD,IN,9.0,1.0,0.0,0.0,0.0,0,0,after,1.0
3,VBZ,VBD,EX,DT,NNP,PRP,6.0,0.0,0.0,0.0,0.0,0,0,vague,
4,VBZ,VBD,EX,DT,NNS,NNP,16.0,0.0,0.0,0.0,0.0,0,0,vague,


In [0]:
df['y']=df['y'].fillna('null')
df=df[~df['y'].isin(['null'])]
df = df.dropna(axis=0,how='any')

In [78]:
df.head()

Unnamed: 0,POS1,POS2,BEFORE_1,AFTER_1,BEFORE_2,AFTER_2,DISTANCE,MODAL,TEMP_BEFORE,TEMP_AFTER,TEMP_SINCE,SYN,DERIVATION,RELATION,y
1,VBD,VB,NNP,PRP,MD,IN,12.0,1.0,0.0,0.0,0.0,0,0,after,1.0
2,VBG,VB,VBD,NNS,MD,IN,9.0,1.0,0.0,0.0,0.0,0,0,after,1.0
5,VBZ,VBN,EX,DT,RB,RBR,42.0,0.0,0.0,0.0,0.0,1,0,after,1.0
6,VBZ,VBP,EX,DT,PRP,JJR,47.0,0.0,0.0,0.0,0.0,0,0,after,1.0
8,VBZ,VBZ,EX,DT,WDT,RBR,61.0,0.0,0.0,0.0,0.0,1,1,simultaneous,2.0


In [79]:
print(list(df.columns) )

['POS1', 'POS2', 'BEFORE_1', 'AFTER_1', 'BEFORE_2', 'AFTER_2', 'DISTANCE', 'MODAL', 'TEMP_BEFORE', 'TEMP_AFTER', 'TEMP_SINCE', 'SYN', 'DERIVATION', 'RELATION', 'y']


In [0]:
y_data = df['y']
# x_data = df[['POS1', 'POS2', 'BEFORE_1', 'AFTER_1', 'BEFORE_2', 'AFTER_2', 'DISTANCE', 'MODAL', 'TEMP_BEFORE', 'TEMP_AFTER', 'TEMP_SINCE', 'SYN', 'DERIVATION']]
x_data = df[[ 'DISTANCE', 'MODAL', 'TEMP_BEFORE', 'TEMP_AFTER', 'TEMP_SINCE', 'SYN', 'DERIVATION']]

In [0]:
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, train_size=0.8, random_state=0)

In [82]:
from collections import Counter
print('dataset shape {}'.format(Counter(y_train)))

dataset shape Counter({0.0: 861, 1.0: 439, 2.0: 381})


In [83]:
print('dataset shape {}'.format(Counter(y_data)))

dataset shape Counter({0.0: 1086, 1.0: 545, 2.0: 471})


In [84]:
df.shape

(2102, 15)

In [85]:
!pip install imbalanced-learn



In [86]:
import imblearn
print(imblearn.__version__)

0.4.3


In [0]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split as tts

In [88]:

from collections import Counter
print('dataset shape {}'.format(Counter(y_data)))
print('dataset shape {}'.format(Counter(y_train)))

dataset shape Counter({0.0: 1086, 1.0: 545, 2.0: 471})
dataset shape Counter({0.0: 861, 1.0: 439, 2.0: 381})


# Naíve Bayes

In [0]:
pipeline = Pipeline([    
    ('clf', BernoulliNB()),
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [93]:

def test_model(X_test,y_test, model):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    report = classification_report(y_pred, y_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, y_pred),
                columns=['p_BEFORE', 'p_AFTER','p_SIMU'], index=[ 'BEFORE', 'AFTER','SIMU'])
    return accuracy, report, matrix

accuracy, report, matrix = test_model(X_test, y_test, pipeline)
print("the accuracy of the model is {accuracy:.{digits}f}".format(accuracy=accuracy, digits=2))
print()
print(report)
print()
print(matrix)

the accuracy of the model is 0.55

              precision    recall  f1-score   support

         0.0       0.96      0.56      0.70       387
         1.0       0.00      0.00      0.00         0
         2.0       0.19      0.50      0.27        34

    accuracy                           0.55       421
   macro avg       0.38      0.35      0.33       421
weighted avg       0.89      0.55      0.67       421


        p_BEFORE  p_AFTER  p_SIMU
BEFORE       215        0      10
AFTER         99        0       7
SIMU          73        0      17


  _warn_prf(average, modifier, msg_start, len(result))


SMOTE

In [135]:
pipeline = Pipeline([    

     ('clf', BernoulliNB()),
])

smo = SMOTE(sampling_strategy={1.0:600,2.0:600 })
# smo = SMOTE()
X_smo, y_smo = smo.fit_sample(X_train, y_train)

from collections import Counter
print('dataset shape {}'.format(Counter(y_smo)))
pipeline.fit(X_smo, y_smo)
y_pred = pipeline.predict(X_test)

def test_model(X_test,y_test, model):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    report = classification_report(y_pred, y_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, y_pred),
                columns=['p_BEFORE', 'p_AFTER','p_SIMU'], index=['BEFORE', 'AFTER','SIMU'])
    return accuracy, report, matrix

accuracy, report, matrix = test_model(X_test, y_test, pipeline)
print("the accuracy of the model is {accuracy:.{digits}f}".format(accuracy=accuracy, digits=2))
print()
print(report)
print()
print(matrix)

dataset shape Counter({0.0: 861, 1.0: 600, 2.0: 600})
the accuracy of the model is 0.53

              precision    recall  f1-score   support

         0.0       0.91      0.55      0.68       372
         1.0       0.01      0.50      0.02         2
         2.0       0.21      0.40      0.28        47

    accuracy                           0.53       421
   macro avg       0.38      0.48      0.33       421
weighted avg       0.82      0.53      0.63       421


        p_BEFORE  p_AFTER  p_SIMU
BEFORE       204        1      20
AFTER         97        1       8
SIMU          71        0      19




# Logistic Regression

In [0]:
pipeline = Pipeline([    

  ('clf', LogisticRegression(
      solver='newton-cg',  
      multi_class='multinomial', 
      )),
])


pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [95]:

def test_model(X_test,y_test, model):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    report = classification_report(y_pred, y_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, y_pred),
                columns=['p_BEFORE', 'p_AFTER','p_SIMU'], index=[ 'BEFORE', 'AFTER','SIMU'])
    return accuracy, report, matrix

accuracy, report, matrix = test_model(X_test, y_test, pipeline)
print("the accuracy of the model is {accuracy:.{digits}f}".format(accuracy=accuracy, digits=2))
print()
print(report)
print()
print(matrix)

the accuracy of the model is 0.55

              precision    recall  f1-score   support

         0.0       0.96      0.55      0.70       389
         1.0       0.02      0.50      0.04         4
         2.0       0.17      0.54      0.25        28

    accuracy                           0.55       421
   macro avg       0.38      0.53      0.33       421
weighted avg       0.89      0.55      0.66       421


        p_BEFORE  p_AFTER  p_SIMU
BEFORE       215        2       8
AFTER         99        2       5
SIMU          75        0      15


## SMOTE

In [136]:
pipeline = Pipeline([    

  ('clf', LogisticRegression(
      solver='newton-cg',  
      multi_class='multinomial', 
      )),
])

# smo = SMOTE(sampling_strategy={1.0:600,2.0:600 })
smo = SMOTE()
X_smo, y_smo = smo.fit_sample(X_train, y_train)

from collections import Counter
print('dataset shape {}'.format(Counter(y_smo)))
pipeline.fit(X_smo, y_smo)
y_pred = pipeline.predict(X_test)

def test_model(X_test,y_test, model):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    report = classification_report(y_pred, y_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, y_pred),
                columns=[ 'p_BEFORE', 'p_AFTER','p_SIMU'], index=[ 'BEFORE', 'AFTER','SIMU'])
    return accuracy, report, matrix

accuracy, report, matrix = test_model(X_test, y_test, pipeline)
print("the accuracy of the model is {accuracy:.{digits}f}".format(accuracy=accuracy, digits=2))
print()
print(report)
print()
print(matrix)

dataset shape Counter({0.0: 861, 1.0: 861, 2.0: 861})
the accuracy of the model is 0.37

              precision    recall  f1-score   support

         0.0       0.25      0.61      0.36        94
         1.0       0.71      0.27      0.40       273
         2.0       0.24      0.41      0.31        54

    accuracy                           0.37       421
   macro avg       0.40      0.43      0.35       421
weighted avg       0.55      0.37      0.38       421


        p_BEFORE  p_AFTER  p_SIMU
BEFORE        57      144      24
AFTER         23       75       8
SIMU          14       54      22




# Random Forest

In [123]:
pipeline = Pipeline([    
  ('clf', RandomForestClassifier(n_estimators = 43,
                        min_samples_split = 20,
                        max_depth = 250))
])

from collections import Counter
print('dataset shape {}'.format(Counter(y_train)))
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

dataset shape Counter({0.0: 861, 1.0: 439, 2.0: 381})


In [124]:

def test_model(X_test,y_test, model):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    report = classification_report(y_pred, y_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, y_pred),
                columns=[ 'p_BEFORE', 'p_AFTER','p_SIMU'], index=[ 'BEFORE', 'AFTER','SIMU'])
    return accuracy, report, matrix

accuracy, report, matrix = test_model(X_test, y_test, pipeline)
print("the accuracy of the model is {accuracy:.{digits}f}".format(accuracy=accuracy, digits=2))
print()
print(report)
print()
print(matrix)

the accuracy of the model is 0.54

              precision    recall  f1-score   support

         0.0       0.86      0.56      0.68       345
         1.0       0.11      0.33      0.17        36
         2.0       0.23      0.53      0.32        40

    accuracy                           0.54       421
   macro avg       0.40      0.47      0.39       421
weighted avg       0.73      0.54      0.60       421


        p_BEFORE  p_AFTER  p_SIMU
BEFORE       193       19      13
AFTER         88       12       6
SIMU          64        5      21


## Random Forest SMOTE

In [137]:
pipeline = Pipeline([    
  ('clf', RandomForestClassifier(n_estimators = 43,
                        min_samples_split = 2,
                        max_depth = 500))
])

# smo = SMOTE({1.0:600,2.0:600 })
smo = SMOTE()
X_smo, y_smo = smo.fit_sample(X_train, y_train)

from collections import Counter
print('dataset shape {}'.format(Counter(y_smo)))
pipeline.fit(X_smo, y_smo)
y_pred = pipeline.predict(X_test)


def test_model(X_test,y_test, model):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    report = classification_report(y_pred, y_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, y_pred),
                columns=['p_BEFORE', 'p_AFTER','p_SIMU'], index=[ 'BEFORE', 'AFTER','SIMU'])
    return accuracy, report, matrix

accuracy, report, matrix = test_model(X_test, y_test, pipeline)
print("the accuracy of the model is {accuracy:.{digits}f}".format(accuracy=accuracy, digits=2))
print()
print(report)
print()
print(matrix)

dataset shape Counter({0.0: 861, 1.0: 861, 2.0: 861})
the accuracy of the model is 0.43

              precision    recall  f1-score   support

         0.0       0.48      0.61      0.54       175
         1.0       0.38      0.31      0.34       130
         2.0       0.39      0.30      0.34       116

    accuracy                           0.43       421
   macro avg       0.41      0.41      0.40       421
weighted avg       0.42      0.43      0.42       421


        p_BEFORE  p_AFTER  p_SIMU
BEFORE       107       60      58
AFTER         43       40      23
SIMU          25       30      35




## GradientBoosting

In [0]:
from sklearn.ensemble import GradientBoostingClassifier

In [130]:
pipeline = Pipeline([    
  ('clf', GradientBoostingClassifier())
])

smo = SMOTE({1.0:600,2.0:600 })
X_smo, y_smo = smo.fit_sample(X_train, y_train)

from collections import Counter
print('dataset shape {}'.format(Counter(y_train)))
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

def test_model(X_test,y_test, model):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    report = classification_report(y_pred, y_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, y_pred),
                columns=[ 'p_BEFORE', 'p_AFTER','p_SIMU'], index=[ 'BEFORE', 'AFTER','SIMU'])
    return accuracy, report, matrix

accuracy, report, matrix = test_model(X_test, y_test, pipeline)
print("the accuracy of the model is {accuracy:.{digits}f}".format(accuracy=accuracy, digits=2))
print()
print(report)
print()
print(matrix)



dataset shape Counter({0.0: 861, 1.0: 439, 2.0: 381})
the accuracy of the model is 0.55

              precision    recall  f1-score   support

         0.0       0.93      0.56      0.70       373
         1.0       0.08      0.44      0.13        18
         2.0       0.17      0.50      0.25        30

    accuracy                           0.55       421
   macro avg       0.39      0.50      0.36       421
weighted avg       0.84      0.55      0.64       421


        p_BEFORE  p_AFTER  p_SIMU
BEFORE       209        5      11
AFTER         94        8       4
SIMU          70        5      15


 ## GradientBoosting SMOTE

In [138]:
pipeline = Pipeline([    
  ('clf', GradientBoostingClassifier())
])

# smo = SMOTE({1.0:600,2.0:600 })
smo = SMOTE()
X_smo, y_smo = smo.fit_sample(X_train, y_train)

from collections import Counter
print('dataset shape {}'.format(Counter(y_train)))
print('dataset shape {}'.format(Counter(y_smo)))
pipeline.fit(X_smo, y_smo)
y_pred = pipeline.predict(X_test)

def test_model(X_test,y_test, model):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    report = classification_report(y_pred, y_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, y_pred),
                columns=['p_BEFORE', 'p_AFTER','p_SIMU'], index=[ 'BEFORE', 'AFTER','SIMU'])
    return accuracy, report, matrix

accuracy, report, matrix = test_model(X_test, y_test, pipeline)
print("the accuracy of the model is {accuracy:.{digits}f}".format(accuracy=accuracy, digits=2))
print()
print(report)
print()
print(matrix)



dataset shape Counter({0.0: 861, 1.0: 439, 2.0: 381})
dataset shape Counter({0.0: 861, 1.0: 861, 2.0: 861})
the accuracy of the model is 0.42

              precision    recall  f1-score   support

         0.0       0.40      0.63      0.49       142
         1.0       0.55      0.30      0.38       196
         2.0       0.32      0.35      0.34        83

    accuracy                           0.42       421
   macro avg       0.42      0.43      0.40       421
weighted avg       0.45      0.42      0.41       421


        p_BEFORE  p_AFTER  p_SIMU
BEFORE        90       97      38
AFTER         32       58      16
SIMU          20       41      29
