In [1]:
from os.path import dirname, join

In [3]:
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report

In [4]:
from simpletransformers.language_representation import RepresentationModel


In [12]:
# Load the training split. header=None stops pandas from using the first row
# as column names, so the columns get the integer labels 0 and 1.
train_df = pd.read_csv(filepath_or_buffer="data/train.csv", header=None)
# Preview the first five rows.
train_df.head(n=5)

Unnamed: 0,0,1
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [13]:
# Load the evaluation split. header=None stops pandas from using the first row
# as column names, so the columns get the integer labels 0 and 1.
eval_df = pd.read_csv(filepath_or_buffer="data/test.csv", header=None)
# Preview the first five rows.
eval_df.head(n=5)

Unnamed: 0,0,1
0,2,"Contrary to other reviews, I have zero complai..."
1,1,Last summer I had an appointment to get new ti...
2,2,"Friendly staff, same starbucks fair you get an..."
3,1,The food is good. Unfortunately the service is...
4,2,Even when we didn't have a car Filene's Baseme...


In [14]:
# In column 0 the raw data uses 1 for negative polarity and 2 for positive polarity.
# Convert column 0 to a binary label: 0 = negative, 1 = positive.
def _polarity_to_binary(labels):
    """Return 0/1 integer labels from a column of polarity labels.

    Args:
        labels: pandas Series holding either the raw labels (1 = negative,
            2 = positive) or labels that were already converted to 0/1.

    Returns:
        A Series of ints where 1 marks positive polarity and 0 negative.

    Raises:
        ValueError: if the column holds anything other than 1/2 or 0/1
            (for example missing values), rather than silently labelling
            those rows as 0.
    """
    observed = set(labels.unique())
    if observed <= {1, 2}:
        # Raw labels. A column holding only the value 1 is ambiguous and is
        # treated as raw (all negative), which is what a plain `== 2` did.
        return (labels == 2).astype(int)
    if observed <= {0, 1}:
        # Already converted by an earlier run of this cell. Comparing with 2
        # again would turn every label into 0, so keep the values as they are.
        return labels.astype(int)
    raise ValueError(
        f"Unexpected polarity labels: {sorted(observed, key=str)}"
    )


train_df[0] = _polarity_to_binary(train_df[0])
eval_df[0] = _polarity_to_binary(eval_df[0])

In [15]:
# Preview the first five evaluation rows to check the values in column 0.
eval_df.head(n=5)

Unnamed: 0,0,1
0,1,"Contrary to other reviews, I have zero complai..."
1,0,Last summer I had an appointment to get new ti...
2,1,"Friendly staff, same starbucks fair you get an..."
3,0,The food is good. Unfortunately the service is...
4,1,Even when we didn't have a car Filene's Baseme...


In [19]:
# Preview the first five training rows to check the values in column 0.
train_df.head(n=5)

Unnamed: 0,0,1
0,0,"Unfortunately, the frustration of being Dr. Go..."
1,1,Been going to Dr. Goldberg for over 10 years. ...
2,0,I don't know what Dr. Goldberg was like before...
3,0,I'm writing this review to give you a heads up...
4,1,All the food is great here. But the best thing...


In [25]:
# Build the training frame: clean the review text, rename the columns to
# "text" / "labels", and keep only the first 1000 rows so the notebook runs quickly.
# NOTE(review): with regex=True the pattern r"\n" matches a real newline
# character. If the CSV stores line breaks as a literal backslash followed by
# "n", the pattern would need to be r"\\n" - confirm against the data.
training_df = (
    pd.DataFrame(
        {
            "text": train_df[1].replace(r"\n", " ", regex=True),
            "labels": train_df[0],
        }
    )
    .head(1000)
)
print(training_df.head())

                                                text  labels
0  Unfortunately, the frustration of being Dr. Go...       0
1  Been going to Dr. Goldberg for over 10 years. ...       1
2  I don't know what Dr. Goldberg was like before...       0
3  I'm writing this review to give you a heads up...       0
4  All the food is great here. But the best thing...       1


In [23]:
# Build the evaluation frame: clean the review text, rename the columns to
# "text" / "labels", and keep only the first 100 rows so the notebook runs quickly.
# NOTE(review): with regex=True the pattern r"\n" matches a real newline
# character. If the CSV stores line breaks as a literal backslash followed by
# "n", the pattern would need to be r"\\n" - confirm against the data.
evaluation_df = (
    pd.DataFrame(
        {
            "text": eval_df[1].replace(r"\n", " ", regex=True),
            "labels": eval_df[0],
        }
    )
    .head(100)
)
print(evaluation_df.head())

                                                text  labels
0  Contrary to other reviews, I have zero complai...       1
1  Last summer I had an appointment to get new ti...       0
2  Friendly staff, same starbucks fair you get an...       1
3  The food is good. Unfortunately the service is...       0
4  Even when we didn't have a car Filene's Baseme...       1


In [31]:
# Candidate encoders. Entries at the same position in the two lists belong
# together (model type and the matching model name).
model_type_list = ["bert", "roberta", "gpt2"]
model_name_list = ["bert-base-uncased", "roberta-base", "gpt2"]

# Create the representation model. Index 1 selects "roberta" / "roberta-base".
model = RepresentationModel(
    model_type=model_type_list[1],
    model_name=model_name_list[1],
    args=dict(
        no_save=True,
        reprocess_input_data=True,
        overwrite_output_dir=True,
    ),
    use_cuda=False,  # do not use CUDA
)

layer.0.attention.self.value.weight', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.0.attention.output.dense.weight', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.intermediate.dense.weight', 'roberta.encoder.layer.0.intermediate.dense.bias', 'roberta.encoder.layer.0.output.dense.weight', 'roberta.encoder.layer.0.output.dense.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.self.query.weight', 'roberta.encoder.layer.1.attention.self.query.bias', 'roberta.encoder.layer.1.attention.self.key.weight', 'roberta.encoder.layer.1.attention.self.key.bias', 'roberta.encoder.layer.1.attention.self.value.weight', 'roberta.encoder.layer.1.attention.self.value.bias', 'roberta.encoder.layer.1.attention.output.dense.weight', 'rob

In [32]:
# Turn every training review into a vector with the representation model.
# combine_strategy="mean" presumably averages the per-token vectors into one
# vector per text - see the simpletransformers docs to confirm.
train_vectors = model.encode_sentences(
    training_df["text"].tolist(),
    combine_strategy="mean",
)

In [33]:
# Turn every evaluation review into a vector, using the same settings as for
# the training texts.
eval_vectors = model.encode_sentences(
    evaluation_df["text"].tolist(),
    combine_strategy="mean",
)


In [34]:
# Fit a ridge classifier on the training vectors and their labels, then use it
# to predict a label for every evaluation vector.
clf_model = RidgeClassifier().fit(X=train_vectors, y=training_df["labels"])
predictions = clf_model.predict(X=eval_vectors)

In [35]:
# Compare the predictions with the true labels of the evaluation rows and
# print the per-class precision / recall / F1 report.
print(
    classification_report(
        y_true=evaluation_df["labels"],
        y_pred=predictions,
    )
)

              precision    recall  f1-score   support

           0       0.67      0.79      0.72        48
           1       0.77      0.63      0.69        52

    accuracy                           0.71       100
   macro avg       0.72      0.71      0.71       100
weighted avg       0.72      0.71      0.71       100



Cases that are considered 'correct':  
True Positive: the case was positive and was predicted positive.  
True Negative: the case was negative and was predicted negative.  

Cases that are considered 'wrong':  
False Positive: the case was negative, but the model's guess was positive.  
False Negative: the case was positive, but the model's guess was negative.  