-
Notifications
You must be signed in to change notification settings - Fork 4
/
avg_words_tfidf.py
121 lines (81 loc) · 2.89 KB
/
avg_words_tfidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import pandas as pd
import numpy as np
import os
import pickle
from collections import defaultdict
from gensim.models import Word2Vec as w2v
from evaluation import evaluate
def characterVec(words, model, num_features):
    """Average the word2vec vectors of the individual characters of ``words``.

    Fallback used when none of the whitespace-separated tokens of a comment
    are in the word2vec vocabulary: iterate the string character by
    character instead.

    Parameters
    ----------
    words : str
        Raw comment text (iterated character by character).
    model : mapping of token -> np.ndarray
        Word2vec model; must support ``in`` and ``[]`` lookup.
    num_features : int
        Dimensionality of the embedding vectors.

    Returns
    -------
    np.ndarray, shape (num_features,), dtype float32
        Mean vector of the in-vocabulary characters, or all zeros when no
        character is in the vocabulary.
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    for word in words:
        # Skip characters missing from the vocabulary instead of raising
        # KeyError (the original crashed on any unseen character).
        if word in model:
            nwords += 1.
            featureVec = np.add(featureVec, model[word])
    # Guard against 0/0 (NaN vector) when nothing matched.
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec
# One of the kaggle tests
def makeFeatureVec(words, model, tfidf_model, num_features):
    """TF-IDF-weighted average word2vec vector of one comment.

    Each in-vocabulary token contributes ``model[word] * tfidf_model[word]``
    to the sum, which is then divided by the number of matched tokens.

    Parameters
    ----------
    words : str
        The comment text; split on whitespace.
    model : mapping of token -> np.ndarray
        Word2vec model; must support ``in`` and ``[]`` lookup.
    tfidf_model : mapping of token -> float
        IDF weight per token (a defaultdict, so unknown tokens still map
        to a value — see getTFIDIF).
    num_features : int
        Dimensionality of the embedding vectors.

    Returns
    -------
    np.ndarray, shape (num_features,), dtype float32
        Weighted mean vector, or the character-level fallback from
        characterVec when no token is in the vocabulary.
    """
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,), dtype="float32")
    # Count number of matched words
    nwords = 0.
    # If a token is in the word2vec vocabulary, add its weighted vector
    for word in words.split():
        if word in model:  # and word not in stop_words
            nwords += 1.
            wordFeature = np.multiply(model[word], tfidf_model[word])
            featureVec = np.add(featureVec, wordFeature)
    if nwords == 0:
        # No token matched: fall back to a character-level average.
        return characterVec(words, model, num_features)
    # Divide only after the zero check — the original divided first,
    # emitting a 0/0 RuntimeWarning and a NaN vector before the fallback.
    return np.divide(featureVec, nwords)
# One of the kaggle tests
def getAvgFeatureVecs(comments, model, tfidf_model, num_features):
    """Vectorize every comment into one row of a 2-D feature matrix.

    Parameters
    ----------
    comments : sequence of str
        The comments to embed (must support len()).
    model : mapping of token -> np.ndarray
        Word2vec model.
    tfidf_model : mapping of token -> float
        IDF weight per token.
    num_features : int
        Dimensionality of each feature vector.

    Returns
    -------
    np.ndarray, shape (len(comments), num_features), dtype float32
        One averaged feature vector per comment.
    """
    # Preallocate the whole output matrix up front for speed.
    reviewFeatureVecs = np.zeros((len(comments), num_features), dtype="float32")
    # enumerate replaces the manual counter of the original.
    for row, comment in enumerate(comments):
        reviewFeatureVecs[row] = makeFeatureVec(comment, model, tfidf_model, num_features)
    return reviewFeatureVecs
# Get TFIDF Model and Transform it
# Motivation: http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/
def getTFIDIF(FILE):
    """Load a pickled TF-IDF model and flatten it into a word -> idf dict.

    Parameters
    ----------
    FILE : str
        Path to a pickle of an object with ``idf_`` (sequence of idf
        weights) and ``vocabulary_`` (word -> index mapping) attributes,
        e.g. a fitted sklearn TfidfVectorizer.

    Returns
    -------
    collections.defaultdict
        Maps each vocabulary word to its idf weight; unknown words default
        to the maximum idf (i.e. they are treated as maximally rare).
    """
    # SECURITY NOTE: pickle.load executes arbitrary code from the file —
    # only unpickle files from a trusted source.
    # Use a context manager so the handle is closed (the original leaked it).
    with open(FILE, "rb") as fh:
        tfidf_model = pickle.load(fh)
    # Words absent from the vocabulary fall back to the max idf value.
    max_idf = max(tfidf_model.idf_)
    return defaultdict(lambda: max_idf,
                       ((w, tfidf_model.idf_[i])
                        for w, i in tfidf_model.vocabulary_.items()))
# ---------------------------------------------------------------------------
# Script entry: load the word2vec and TF-IDF models, vectorize the dataset,
# and hand the features to the shared evaluation harness.
# ---------------------------------------------------------------------------
# Clear the terminal. 'cls' only exists on Windows; use 'clear' elsewhere
# (the original unconditionally ran 'cls', a no-op/error on POSIX).
os.system('cls' if os.name == 'nt' else 'clear')
# Load Word2Vec model here
print("LOADING WORD2VEC MODEL \n\n")
FILE = "W2V Models/w2v_reddit_unigram_300d.bin"
# NOTE(review): load_word2vec_format on the Word2Vec class is a removed
# gensim API — modern gensim uses KeyedVectors.load_word2vec_format; this
# script appears pinned to an old gensim. Confirm the installed version.
model = w2v.load_word2vec_format(FILE, binary=True)
# Load TF-IDF Model Here
print("LOADING TFIDF DICTIONARY \n\n")
FILE = "TFIDF models/tfidf_stop.pk"
tfidf_model = getTFIDIF(FILE)
# Load the dataset here
print("LOADING DATASET \n\n")
df = pd.read_csv('balanced_dataset.csv')
# Separate out comments and labels
X, y = df['Comment'], df['Insult']
# Data Transformation: 300-d TF-IDF-weighted average word2vec features
print("TRANSFORMING DATA \n\n")
X = getAvgFeatureVecs(X, model, tfidf_model, 300)
# Use this script's base name (minus .py) to label the evaluation output
file_name = os.path.basename(__file__)
file_name = file_name.replace(".py", "")
# Evaluate models
evaluate(X, y, file_name)