-
Notifications
You must be signed in to change notification settings - Fork 4
/
1_Traindata_preparation.py
executable file
·146 lines (127 loc) · 5.28 KB
/
1_Traindata_preparation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""
Created on Jul2017
My Implementation of the paper: End-to-End Multi-View Networks for Text Classification
@author: Yousof Erfani
"""
import glove
from glove import Glove
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import tensorflow as tf
import sklearn
from nltk.corpus import stopwords
import os
import pdb
import re
# folder addresses (relative to the working directory)
train_neg_Address = 'dataset/train_neg'  # negative-class training reviews, one file per review
train_pos_Address = 'dataset/train_pos'  # positive-class training reviews, one file per review
# Parameters
embedding_length = 50  # dimensionality of the GloVe vectors (matches glove.6B.50d)
n_examples = 1875  # number of documents embedded per class
def read_file(addr):
    """Read every file in directory *addr*.

    Returns a list with one entry per file; each entry is that file's
    list of lines (as returned by ``readlines``, newlines included).
    """
    lines = []
    for file_name in os.listdir(addr):
        # `with` guarantees the handle is closed even if readlines raises
        # (the original opened/closed manually and leaked on error).
        with open(os.path.join(addr, file_name), 'r') as textfile:
            lines.append(textfile.readlines())
    return lines
# Load the raw review documents (one file per review) for each class.
train_neg_lines = read_file(train_neg_Address)
train_pos_lines = read_file(train_pos_Address)
# load GloVe model
def loadGloveModel(gloveFile):
    """Load a GloVe text file into a dict mapping word -> list[float] embedding.

    Reads the file exactly once (the original opened and read it twice just
    to count lines, leaking both handles) and prints progress roughly every
    1% of the way through.
    """
    print ("Loading Glove Model")
    model = {}
    with open(gloveFile, 'r') as f:
        all_lines = f.readlines()
    leng = len(all_lines)
    # max(..., 1) prevents the original's modulo-by-zero crash on files
    # shorter than 100 lines.
    step = max(leng // 100, 1)
    for cc, line in enumerate(all_lines, start=1):
        if cc % step == 0:
            print(str(int(100*cc/leng))+'%')
        splitLine = line.split()
        word = splitLine[0]
        # Remaining fields are the embedding components.
        model[word] = [float(val) for val in splitLine[1:]]
    print ("Done.",len(model)," words loaded!")
    return model
# Build the word -> 50-d vector lookup from the pre-trained GloVe file.
model = loadGloveModel ('glove.6B/' + 'glove.6B.50d.txt')
def find_sentence_length(text):
    """Return the whitespace-token count of the first line of each document.

    *text* is a list of per-document line lists (as produced by read_file);
    only the first line of each document is measured.
    """
    return [len(doc[0].split()) for doc in text]
def _clean_word(word):
    """Strip punctuation, digits, and HTML fragments from a raw token.

    Reproduces the original cleaning rules in the original order (order
    matters); most of the suffix/prefix trims are already covered by the
    substitutions above them but are kept for identical behavior.
    """
    word = re.sub(r"['.:;!]", '', word)
    word = re.sub(r"<br", '', word)
    word = re.sub(r"[-)()\/*\#$\%\&\+\=\?\[\]\@\_]", ' ', word)
    word = re.sub(r'[",>]', ' ', word)
    word = re.sub(r'[0-9]', '', word)
    if word.endswith(('.', ',', '?', '!', ':', '"', ')')):
        word = word[:-1]
    if word.endswith(("'s", "!!", "??", "?!")):
        word = word[:-2]
    if word.endswith("n't"):
        word = word[:-3]
    if word.endswith(".<br"):
        word = word[:-4]
    if word.startswith('/>'):
        word = word[2:]
    if word.startswith(('"', '(')):
        word = word[1:]
    return word

def find_embeddings(text, median_length):
    """Embed each document as a fixed-size matrix of GloVe vectors.

    Parameters
    ----------
    text : list of per-document line lists (each item's first element is the
        raw review text), as produced by read_file.
    median_length : int — number of word vectors kept per document; shorter
        documents are zero-padded, longer ones truncated.

    Returns
    -------
    embeddings : float64 array, shape (len(text), median_length, embedding_length)
    word_counts_grouped : DataFrame of per-word occurrence counts indexed by
        word_name (seeded with one count of 'the', matching the original).
    """
    global model
    # Fix: the original rebuilt set(stopwords.words('english')) for EVERY
    # word of every document (it computed this local and never used it);
    # build the set once up front.
    stop_words = set(stopwords.words('english'))
    word_counts = {'the': 1}  # seed row kept from the original
    embeddings = np.zeros((len(text), median_length, embedding_length), dtype='float64')
    for line_no, line in enumerate(text):
        print(line_no)
        cnt = 0
        for word in line[0].split():
            if cnt >= median_length:
                break  # document slots already filled
            word = _clean_word(word)
            # NOTE: the original called np.unicode(word) here; that alias was
            # removed in NumPy >= 1.20 (and every Python 3 str is unicode),
            # so the check is dropped.
            for subword in word.split():
                if cnt >= median_length:
                    # Re-check inside the subword loop: the original could
                    # overrun the array here and silently swallowed the
                    # IndexError in its bare except.
                    break
                # Stopword test on the raw (un-lowered) token, as before.
                if subword in stop_words:
                    continue
                sub = subword.lower()
                try:
                    vec = model[sub]
                except KeyError:
                    # Out-of-vocabulary word; skip it. The original's bare
                    # except also hid unrelated errors.
                    print('NOt found for ', word.lower())
                    continue
                embeddings[line_no, cnt, :] = vec
                word_counts[sub] = word_counts.get(sub, 0) + 1
                cnt += 1
    # Equivalent of the original DataFrame.append + groupby(...).sum():
    # DataFrame.append was removed in pandas 2.0, so accumulate in a dict
    # and materialize once (groupby also sorted the index).
    word_counts_grouped = pd.DataFrame(
        {'count': pd.Series(word_counts, dtype='int64')}).sort_index()
    word_counts_grouped.index.name = 'word_name'
    return embeddings, word_counts_grouped
# Dealing with the variable-length sentence input: every document is
# truncated/zero-padded to the median sentence length over BOTH classes.
setnece_length_neg= find_sentence_length(train_neg_lines)
sentence_length_pos= find_sentence_length(train_pos_lines)
# extend mutates the negative-length list in place so the median below is
# taken over the combined distribution.
setnece_length_neg.extend(sentence_length_pos)
med_sentence_leng = int(np.median (setnece_length_neg))
print('median_sentence_leng_train:', med_sentence_leng )
# Embed the first n_examples documents of each class.
embedding_train_neg , wcount_neg = find_embeddings(train_neg_lines[:n_examples],med_sentence_leng)
wcount_neg = wcount_neg.reset_index()
print ("Finished Computing the embeddings for negative documents")
embedding_train_pos, w_count_pos = find_embeddings(train_pos_lines[:n_examples],med_sentence_leng)
print ("Finished Computing the embeddings for positive documents")
# find one hot encoding for labels
# Positive documents come first in train_data (see concatenate below), so a
# positive row gets [1, 0] and a negative row gets [0, 1].
labels = np.concatenate((np.ones(embedding_train_pos.shape[0]),np.zeros(embedding_train_neg.shape[0])))
labels = np.reshape (labels,(-1,1))
labels = np.concatenate((labels, 1-labels),axis =1)
train_data = np.concatenate((embedding_train_pos,embedding_train_neg ),axis =0 )
# Shuffle documents and labels with the SAME permutation so they stay aligned.
random_indices = np.random.permutation(train_data.shape[0])
train_data = train_data[random_indices,:,:]
labels = labels [random_indices]
# Free the large intermediates before serializing to keep peak memory down.
del model
del train_pos_lines
del train_neg_lines
np.save('train_data_50.npy', train_data)
np.save('labels_50.npy', labels)
print('Finished')