In [2]:
import numpy as np
import pandas as pd
import re
from keras.preprocessing.text import Tokenizer
from sklearn.utils import shuffle
from matplotlib import pyplot as plt
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Embedding, Masking, Bidirectional
from tensorflow.keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
lem = WordNetLemmatizer()
RANDOM_STATE = 50
UNK_ID = 1
PAD_ID = 0
MAX_LEN = 20

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vaibhav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/vaibhav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vaibhav/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/vaibhav/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def clean_string(s):    
    s =  re.sub(r'(?<=[^\s0-9])(?=[.,;?])', r' ', s)
    s = re.sub(r'\((\d+)\)', r'', s)
    s = re.sub(r'\s\s', ' ', s)
    s = re.sub(r"[^A-Za-z0-9(),!?\'`]", " ", s)
    s = re.sub(r"\'s", " \'s", s)
    s = re.sub(r"\'ve", " \'ve", s)
    s = re.sub(r"n\'t", " n\'t", s)
    s = re.sub(r"\'re", " \'re", s)
    s = re.sub(r"\'d", " \'d", s)
    s = re.sub(r"\'ll", " \'ll", s)
    s = re.sub(r",", " , ", s)
    s = re.sub(r"!", " ! ", s)
    s = re.sub(r"\"", " \" ", s)
    s = re.sub(r"\(", " ( ", s)
    s = re.sub(r"\)", " ) ", s)
    s = re.sub(r"\?", " ? ", s)
    s = re.sub(r"\s{2,}", " ", s)
    s = re.sub(r"\.", " . ", s)
    s = re.sub(r"., ", " , ", s)
    s = re.sub(r"\\n", " ", s)
    return s.strip().lower()

def create_train_valid(features,labels,train_fraction = 0.9,max_valid=70):

    # features,labels = shuffle(features,labels,random_state = RANDOM_STATE)

    train_end = max(int(train_fraction*len(labels)),len(labels)-max_valid)

    train_features = np.asarray(features[:train_end])
    valid_features = np.asarray(features[train_end:])

    train_labels = np.asarray(labels[:train_end])
    valid_labels = np.asarray(labels[train_end:])
    
    return train_features,valid_features,train_labels,valid_labels

In [4]:
data = pd.read_csv('combo.csv').dropna(axis=0,how='any')

features = data['text'].to_list()
labels = data['label'].to_list()

X_train, X_valid, y_train, y_valid = create_train_valid(features,labels)
training_dict = {'X_train': X_train, 'X_valid': X_valid,'y_train': y_train, 'y_valid': y_valid}



In [5]:
glove_loc = '../glove.6B.100d.txt'
glove = np.loadtxt(glove_loc,dtype='str',comments=None)

vectors = glove[:,1:].astype('float')
words = glove[:,0]

del glove

In [6]:
vocab = dict()

for sentences in features:
    sentences = clean_string(sentences)
    sentences = nltk.word_tokenize(sentences)
    for word in sentences:
        vocab[word] = 1

In [7]:
word_lookup = {word:vector for word,vector in zip(words,vectors)}

word_index = dict()
ind = 2
not_found = 0

embeds = dict()

for i,word in enumerate(vocab.keys()):
    vector = word_lookup.get(word,None)

    if vector is not None:
        word_index[word] = ind
        embeds[ind] = np.copy(vector)
        ind+=1
    else:
        not_found +=1
        word_index[word] = UNK_ID

print(f'{not_found} words not found.')

2730 words not found.


In [8]:
def word_to_index(utt1):
    utt2 = [word_index.get(word,UNK_ID) for word in nltk.word_tokenize(clean_string(utt1))]
    utt3 = utt2[:MAX_LEN]
    utt4 = utt3 + [PAD_ID]*(MAX_LEN - len(utt3))
    utt5 = np.mean([embeds[i] for i in utt4 if i>1],axis=0)
    if type(utt5) == np.float64:
        utt5 = np.random.rand(100)
    if type(utt5) == np.float64:
        print('here')
    return utt5


In [9]:
training_dict['X_train'] = [word_to_index(word) for word in training_dict['X_train']]
training_dict['X_valid'] = [word_to_index(word) for word in training_dict['X_valid']]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [10]:
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

In [11]:
def svm_train(features,labels):
    clf = make_pipeline(
        StandardScaler(),
        svm.SVC(C=10.0, gamma="scale", kernel="rbf")
    )
    return clf.fit(features, labels)

In [12]:
def svm_test(clf,features,labels):
    pred = clf.predict(features)
    true = labels

    result_string = classification_report(true, pred, digits=3)
    print(confusion_matrix(true, pred))
    print(result_string)

In [13]:
clf = svm_train(training_dict['X_train'],training_dict['y_train'])

svm_test(clf,training_dict['X_valid'],training_dict['y_valid'])

[[ 3  1]
 [40 26]]
              precision    recall  f1-score   support

           0      0.070     0.750     0.128         4
           1      0.963     0.394     0.559        66

    accuracy                          0.414        70
   macro avg      0.516     0.572     0.343        70
weighted avg      0.912     0.414     0.534        70



In [14]:
print(len(training_dict['X_train'][0]))

100


In [15]:
mi = 100000
ma = -1
for i in range(len(training_dict['X_train'])):
    # print(type(training_dict['X_train'][i]))
    # break
    # print(type(training_dict['X_train'][i]))
    if type(training_dict['X_train'][i]) == np.float64:
        print(training_dict['X_train'][i])
        print(i)
        break
    if len(training_dict['X_train'][i]) < mi:
        mi = len(training_dict['X_train'][i])
    ma = max(ma,len(training_dict['X_train'][i]))

print(mi)
print(ma)

100
100


In [16]:
temp = np.concatenate([[[1,2]],[[3,4]]],axis=0)
print(temp)

[[1 2]
 [3 4]]
