-
Notifications
You must be signed in to change notification settings - Fork 0
/
WSD.py
152 lines (135 loc) · 7.2 KB
/
WSD.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import re
import pandas as pd
import joblib
import numpy as np
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
class Prediction:
def __init__(self, input1, input2, ambiguous_word, word_embedding):
self.word_embeddings = word_embedding
self.word = ambiguous_word
self.sentence1 = input1
self.sentence2 = input2
def preprocess_inputs(self):
# Convert to lowercase
self.sentence1 = self.sentence1.lower()
self.sentence2 = self.sentence2.lower()
# Remove special characters from strings
sentence1 = re.sub(r"[^a-zA-Z0-9]", " ", self.sentence1)
sentence2 = re.sub(r"[^a-zA-Z0-9]", " ", self.sentence2)
# Lemmatize the sentences
wnl = WordNetLemmatizer()
lemma_list = []
for words in sentence1:
lemma_list.append(wnl.lemmatize(words))
lemmatized_sentence1 = " ".join(str(e) for e in lemma_list)
lemma_list = []
for words in sentence2:
lemma_list.append(wnl.lemmatize(words))
lemmatized_sentence2 = " ".join(str(e) for e in lemma_list)
return lemmatized_sentence1, lemmatized_sentence2
def get_sentence_embeddings(self):
sentence1, sentence2 = self.preprocess_inputs()
model = SentenceTransformer("all-roberta-large-v1")
sentence1_encoding = model.encode(sentence1, show_progress_bar=False)
sentence2_encoding = model.encode(sentence2, show_progress_bar=False)
word_vector = self.word_embeddings[self.word]
return sentence1_encoding, sentence2_encoding, word_vector
def evaluation(self):
eval_data = pd.read_csv("Dev.csv")
eval_data["Label"] = eval_data["Label"].map({"T": 1,
"F": 0})
y_test = eval_data["Label"].values
ambiguous_word = eval_data["Ambigous_word"].values
word_vectors = []
for word in ambiguous_word:
try:
word_vectors.append(self.word_embeddings[word])
except KeyError:
word_vectors.append(np.zeros(50))
lemmatized_sentence1 = eval_data["Lemmatized_Sentence1"]
lemmatized_sentence2 = eval_data["Lemmatized_Sentence2"]
model = SentenceTransformer("all-roberta-large-v1")
sentence1_encoding = model.encode(lemmatized_sentence1, show_progress_bar=True)
sentence2_encoding = model.encode(lemmatized_sentence2, show_progress_bar=True)
test_data = np.hstack((word_vectors, sentence1_encoding, sentence2_encoding))
rfc = joblib.load("Data/RandomForestClassifier_all-roberta-large-v1")
etc = joblib.load("Data/ExtraTreesClassifier_all-roberta-large-v1")
svc = joblib.load("Data/SupportVectorClassifier_all-roberta-large-v1")
rfc_prediction = rfc.predict(test_data)
etc_prediction = etc.predict(test_data)
svc_prediction = svc.predict(test_data)
# Accuracy results
print("Results by Random Forest Classifier:")
print("\t Accuracy score = {:.2f}%".format(accuracy_score(y_test, rfc_prediction) * 100))
print("\t Precision score = {:.2f}%".format(precision_score(y_test, rfc_prediction) * 100))
print("\t Recall score = {:.2f}%".format(recall_score(y_test, rfc_prediction) * 100))
print("\t F1 Score = {:.2f}%".format(f1_score(y_test, rfc_prediction) * 100))
print("\nResults by Extra Trees Classifier:")
print("\t Accuracy score = {:.2f}%".format(accuracy_score(y_test, etc_prediction) * 100))
print("\t Precision score = {:.2f}%".format(precision_score(y_test, etc_prediction) * 100))
print("\t Recall score = {:.2f}%".format(recall_score(y_test, etc_prediction) * 100))
print("\t F1 Score = {:.2f}%".format(f1_score(y_test, etc_prediction) * 100))
print("\nResults by Support Vector Classifier:")
print("\t Accuracy score = {:.2f}%".format(accuracy_score(y_test, svc_prediction) * 100))
print("\t Precision score = {:.2f}%".format(precision_score(y_test, svc_prediction) * 100))
print("\t Recall score = {:.2f}%".format(recall_score(y_test, svc_prediction) * 100))
print("\t F1 Score = {:.2f}%".format(f1_score(y_test, svc_prediction) * 100))
def predict(self):
# Load the model
rfc = joblib.load("Data/RandomForestClassifier_all-roberta-large-v1")
etc = joblib.load("Data/ExtraTreesClassifier_all-roberta-large-v1")
svc = joblib.load("Data/SupportVectorClassifier_all-roberta-large-v1")
# Prepare the data
sentence1, sentence2, word = self.get_sentence_embeddings()
inputs = np.hstack((word, sentence1, sentence2))
rfc_prediction = rfc.predict(inputs.reshape(1, -1))
etc_prediction = etc.predict(inputs.reshape(1, -1))
svc_prediction = svc.predict(inputs.reshape(1, -1))
# predictions_list = [rfc_prediction[0], etc_prediction[0], svc_prediction[0]]
# prediction_one = 0
# prediction_zero = 0
# for value in predictions_list:
# if value == 1:
# prediction_one = prediction_one + 1
# else:
# prediction_zero = prediction_zero + 1
# prediction = 1 if prediction_one>prediction_zero else 0
# print()
print("\nPrediction by Random Forest Classifier:")
if rfc_prediction[0] == 1:
print(f"The word '{self.word}' is ambiguous. It has a different meaning in both the sentences.")
else:
print(
f"The word '{self.word}' is not ambiguous, it has the same meaning in both the sentences."
f' Thus, it is "domain independent".')
print("\nPrediction by Extra Trees Classifier:")
if etc_prediction[0] == 1:
print(f"The word '{self.word}' is ambiguous. It has a different meaning in both the sentences.")
else:
print(
f"The word '{self.word}' is not ambiguous, it has the same meaning in both the sentences."
f' Thus, it is "domain independent".')
print("\nPrediction by Support Vector Classifier:")
if svc_prediction[0] == 1:
print(f"The word '{self.word}' is ambiguous. It has a different meaning in both the sentences.")
else:
print(
f"The word '{self.word}' is not ambiguous, it has the same meaning in both the sentences."
f' Thus, it is "domain independent".')
if __name__ == "__main__":
sentence1 = "He walked into a coal mine."
sentence2 = "He stepped on a land mine."
ambiguous_word = "mine"
word_embeddings = {}
with open("Glove/glove.6B.50d.txt", 'r', encoding="utf-8") as f:
for line in f:
values = line.split()
word = values[0]
vector = np.asarray(values[1:], 'float32')
word_embeddings[word] = vector
wsd = Prediction(sentence1, sentence2, ambiguous_word, word_embeddings)
wsd.predict()
print("\n")
# wsd.evaluation()