In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

from nltk import FreqDist
from collections import Counter
from tqdm import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras import layers
from keras import regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint

2022-11-28 16:51:05.838385: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Register tqdm's pandas integration (progress-bar variants such as
# `.progress_apply`). NOTE(review): none of the cells visible here use it.
tqdm.pandas()

In [3]:
# Directory (relative to the notebook) holding the input CSV files; the
# trailing slash matters because file names are appended by string concatenation.
input_path = 'data/input/'

In [4]:
# Read the pre-processed training tweets and replace missing cells with empty
# strings so every text column can be handled as a string later on.
train_csv = input_path + 'train_data_prepped.csv'
data = pd.read_csv(train_csv)
data = data.fillna('')
# test_data = pd.read_csv(input_path + 'test_data_prepped.csv').fillna('')

In [5]:
# Read the sentiment labels of the training tweets and show the class balance.
labels_csv = input_path + 'train_results.csv'
target = pd.read_csv(labels_csv)
target['target'].value_counts()

positive    520436
negative    519803
neutral         84
Name: target, dtype: int64

In [6]:
# Encode the sentiment strings as integers: negative -> 0, positive -> 2,
# and anything else (i.e. neutral) -> 1.
label_codes = {'negative': 0, 'positive': 2}
target['target'] = [label_codes.get(label, 1) for label in target['target'].values]

In [7]:
# Attach the encoded label to each tweet. pandas aligns the two frames on
# their index, so this assumes both CSVs list the tweets in the same row
# order — TODO confirm (e.g. by comparing an id column if one is available).
data['target'] = target['target']

In [8]:
# Shuffle all rows once. The seed is fixed so that a kernel restart yields the
# same row order; later cells take positional slices (e.g. the first 100,000
# rows), which would otherwise pick different tweets on every run.
data = data.sample(frac=1, random_state=0).reset_index(drop=True)

In [9]:
# Preview the first rows of the shuffled frame (raw text, the pre-processed
# text variants and the encoded target).
data.head()

Unnamed: 0,id,text,cat,text_no_punc,text_no_numerals,text_no_sw,text_porter_stemmed,text_lancaster_stemmed,text_lemmatized,target
0,1002697,4 days before i leave manila!,train,4 days before i leave manila!,days before i leave manila!,days leave manila!,day leav manila!,day leav manila!,day leave manila!,0
1,14839,mmmm twitter online... background changed! i s...,train,mmmm twitter online background changed! i shou...,mmmm twitter online background changed! i shou...,mmmm twitter online background changed! start ...,mmmm twitter onlin background changed! start e...,mmmm twit onlin background changed! start expa...,mmmm twitter online background changed! start ...,2
2,461477,"1 heroic shattered halls run, 2 savagery formu...",train,1 heroic shattered halls run 2 savagery formul...,heroic shattered halls run savagery formula ...,heroic shattered halls run savagery formula dr...,heroic shatter hall run savageri formula drop ...,hero shat hal run savagery formul drop woot! s...,heroic shattered hall run savagery formula dro...,0
3,196536,net .. net .. net .. hmm. bad weather .. weir...,train,net net net hmm bad weather weird summer!,net net net hmm bad weather weird summer!,net net net hmm bad weather weird summer!,net net net hmm bad weather weird summer!,net net net hmm bad weath weird summer!,net net net hmm bad weather weird summer!,2
4,825375,@stephj0 why don't you marry it? just kiddi...,train,stephj0 why dont you marry it ? just kiddi...,stephj why dont you marry it ? just kiddin...,stephj marry ? kidding cant resist junior high...,stephj marri ? kid cant resist junior high hum...,stephs marry ? kid cant resist juny high hum s...,stephj marry ? kidding cant resist junior high...,2


In [10]:
# Length of every raw tweet in characters.
data['num_chars'] = data['text'].map(len)

In [11]:
# Number of whitespace-separated tokens in every raw tweet.
data['num_words'] = data['text'].map(lambda tweet: len(tweet.split()))

In [12]:
# Summary statistics of tweet length in characters.
data['num_chars'].describe()

count    1.040323e+06
mean     7.412928e+01
std      3.644499e+01
min      6.000000e+00
25%      4.400000e+01
50%      6.900000e+01
75%      1.040000e+02
max      3.690000e+02
Name: num_chars, dtype: float64

In [13]:
# Summary statistics of tweet length in whitespace-separated words.
data['num_words'].describe()

count    1.040323e+06
mean     1.318478e+01
std      6.959487e+00
min      1.000000e+00
25%      7.000000e+00
50%      1.200000e+01
75%      1.900000e+01
max      6.400000e+01
Name: num_words, dtype: float64

In [14]:
# pos = data[data['target']==2]
# neg = data[data['target']==0]

In [15]:
# pos['num_chars'].describe()

In [16]:
# neg['num_chars'].describe()

In [17]:
# pos['num_words'].describe()

In [18]:
# neg['num_words'].describe()

In [19]:
# pos_corpus = ' '.join(list(pos['text_lemmatized'])).split()

# neg_corpus = ' '.join(list(neg['text_lemmatized'])).split()

In [20]:
# fdist_pos = FreqDist(pos_corpus)
# fdist_neg = FreqDist(neg_corpus)

In [21]:
# fdist_pos.most_common()[:10]

In [22]:
# fdist_neg.most_common()[:10]

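**Idea for choosing words:** find the words with the biggest discrepancy between their TF-IDF in the positive class and in the negative class.<br>
If $\text{TF-IDF}_{pos} / \text{TF-IDF}_{neg}$ is very high, the word is relatively important to the positive class; if it is very low, the word is relatively important to the negative class.<br>
Steps:<br>
1. Fit a TF-IDF vectorizer on the training data.
2. For each word in the vocabulary, compute the average TF-IDF over the positive class and over the negative class.
3. For each word, compute the ratio of the two averages.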

In [23]:
# corpus = list(set(' '.join(list(data['text_porter_stemmed'])).split()))

In [24]:
# corpus[:10]

In [25]:
# len(corpus)

In [26]:
# Unigram TF-IDF vectorizer whose vocabulary is capped at 10,000 terms.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    max_features=10_000,
)

In [27]:
# TF-IDF matrix of the Porter-stemmed tweets, one row per tweet.
# fit_transform returns a SciPy sparse matrix; it is deliberately NOT densified:
# with roughly one million rows and 10,000 float64 columns, .toarray() would
# need on the order of 80 GB of RAM. Row selection (X[rows, :]), .shape and
# column means are all available on the sparse matrix.
X = vectorizer.fit_transform(data['text_porter_stemmed'])

In [28]:
# Row labels of the positive (target == 2) and negative (target == 0) tweets.
is_positive = (data['target'] == 2).to_numpy()
is_negative = (data['target'] == 0).to_numpy()
pos_index = data.index[is_positive]
neg_index = data.index[is_negative]

In [None]:
# TF-IDF rows of the positive tweets. The row labels in pos_index are used as
# positions, which is valid because the frame's index was reset to 0..n-1.
pos_X = X[pos_index, :]


In [None]:
# TF-IDF rows of the negative tweets (same positional selection as for pos_X).
neg_X = X[neg_index, :]

In [None]:
# (number of positive tweets, number of TF-IDF terms)
pos_X.shape

In [None]:
# (number of negative tweets, number of TF-IDF terms)
neg_X.shape

In [None]:
# Class counts after encoding (0 = negative, 1 = neutral, 2 = positive); the
# counts for 2 and 0 should match the row counts of pos_X and neg_X.
data['target'].value_counts()

In [None]:
# Average TF-IDF weight of every term within each class.
# .mean(axis=0) gives a 1-D array for a dense ndarray but a 1 x n_terms matrix
# for a SciPy sparse matrix; asarray + ravel normalises both cases to a flat
# vector of length n_terms.
pos_tfidf_avg = np.asarray(pos_X.mean(axis=0)).ravel()
neg_tfidf_avg = np.asarray(neg_X.mean(axis=0)).ravel()

In [None]:
# Shape of the per-term average TF-IDF for the positive class.
pos_tfidf_avg.shape

In [None]:
# Shape of the per-term average TF-IDF for the negative class.
neg_tfidf_avg.shape

In [None]:
# Terms of the fitted TF-IDF vocabulary, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement. It returns a NumPy array, so the
# printed output is abbreviated instead of listing every term.
print(vectorizer.get_feature_names_out())

In [None]:

# labels = tf.keras.utils.to_categorical(labels, 3, dtype="float32")
# Lemmatized tweet texts as a NumPy array; this is the text fed to the tokenizer.
features = data['text_lemmatized'].to_numpy()


In [None]:
# Show the array of lemmatized tweet texts (the notebook abbreviates long arrays).
features

In [None]:
# Vocabulary size passed to the Keras tokenizer as num_words.
max_words = 10_000

In [None]:
# Fit a Keras tokenizer on the lemmatized tweets and encode them as a TF-IDF
# bag-of-words matrix (one row per tweet, one column per word index).
tokenizer = Tokenizer(num_words=max_words)

# The vocabulary and document frequencies are learned from the full corpus.
tokenizer.fit_on_texts(features)

# texts_to_matrix returns a DENSE float array of shape (n_texts, max_words).
# Only the first 100,000 rows are used to train the random forest, so only
# those are encoded; encoding all ~1M tweets would allocate ten times more
# memory for rows that are never read. The IDF part of the weights comes from
# the statistics gathered by fit_on_texts above, so the encoded rows are the
# same as they would be when encoding the whole corpus.
X = tokenizer.texts_to_matrix(features[:100_000], mode='tfidf')


In [None]:
# Integer class labels (0 / 1 / 2), one per tweet, as a NumPy array.
labels = data['target'].to_numpy()

In [None]:
# #Splitting the data
# X_train, X_test, y_train, y_test = train_test_split(features[:100_000], labels[:100_000], random_state=0)
# X_train = np.asarray(X_train)
# X_test = np.asarray(X_test)
# print (len(X_train),len(X_test),len(y_train),len(y_test))
# print(type(X_train), type(X_test), type(y_train), type(y_test))

# print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Random forest trained on the first 100,000 tweets; it is only used to rank
# the vocabulary columns by feature importance.
# random_state makes the forest (and hence the importance ranking and the
# derived word list) reproducible across runs; n_jobs=-1 builds the trees on
# all available cores.
clf = RandomForestClassifier(n_jobs=-1, random_state=0)
clf.fit(X[:100_000], labels[:100_000])



In [None]:
# Importance score of every column of X according to the fitted random forest.
importances = clf.feature_importances_
importances

In [None]:
# Column indices ranked from most to least important: argsort is ascending,
# so the result is reversed.
ascending_order = np.argsort(importances)
feature_order = ascending_order[::-1]

feature_order.shape

In [None]:
# Column indices of X in order of descending importance.
feature_order


In [None]:
# feature_names is the tokenizer's word -> index mapping; feature_key is the
# reverse lookup (index -> word).
feature_names = tokenizer.word_index
feature_key = dict(zip(feature_names.values(), feature_names.keys()))
feature_key


In [None]:
# Collect the 1000 most important vocabulary words, most important first.
# feature_order holds column indices sorted by descending importance, so the
# ranking starts at position 0; starting the loop at 1 would silently drop the
# single most important word.
# Column 0 of a Keras Tokenizer matrix is reserved (word indices start at 1),
# so it has no entry in feature_key; such columns are skipped instead of
# raising a KeyError.
n_top_words = 1000
top_words = []

for word_key in feature_order:
    if word_key not in feature_key:
        continue
    top_words.append(feature_key[word_key])
    if len(top_words) == n_top_words:
        break

In [None]:
# Display the selected words.
top_words

In [None]:
# Persist the selected words as a single-column CSV (without the row index).
top_words_frame = pd.DataFrame(top_words)
top_words_frame.to_csv('top_words.csv', index=False)

In [None]:
# Number of selected words.
len(top_words)

Only keep `top_words` in the bag-of-words (BOW) representation, then train the neural network (NN) on it.

In [None]:
# Map every selected word to its position in top_words (word -> column index).
word_index = dict(zip(top_words, range(len(top_words))))
word_index

In [None]:
# NOTE(review): max_len is not referenced by any other cell visible here;
# presumably a leftover from a sequence-padding experiment — confirm before removing.
max_len=200

In [None]:
#Tokenizing data and making them sequences
# tokenizer = Tokenizer(num_words=1000)
# tokenizer.word_index = word_index
# tokenizer.fit_on_texts(X_raw)
# features = tokenizer.texts_to_matrix(X_raw, mode="tfidf")


In [None]:
features = data['text_lemmatized'].values
# Restrict the bag-of-words vocabulary to the words selected by feature
# importance: word_index maps every entry of top_words to its column.
# With max_features=len(top_words) alone, the vectorizer would instead keep the
# most frequent terms of the corpus and ignore the selection entirely.
vectorizer = TfidfVectorizer(vocabulary=word_index)

# One row per tweet, one column per selected word.
BOW = vectorizer.fit_transform(features)
BOW_array = BOW.toarray()

In [None]:
# (number of tweets, number of bag-of-words columns)
BOW_array.shape

In [None]:

# One-hot encode the three integer classes (0, 1, 2) as float32 rows.
one_hot_labels = tf.keras.utils.to_categorical(
    labels,
    num_classes=3,
    dtype="float32",
)

In [None]:
# Integer class labels, i.e. the input of the one-hot encoding.
labels

In [None]:
# Split the bag-of-words rows and their one-hot labels into train and test
# parts (train_test_split's default proportions, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    BOW_array[:],
    one_hot_labels[:],
    random_state=0,
)
X_train, X_test = np.asarray(X_train), np.asarray(X_test)

# Report the size, container type and shape of each part.
split_parts = (X_train, X_test, y_train, y_test)
print(*(len(part) for part in split_parts))
print(*(type(part) for part in split_parts))

print(*(part.shape for part in split_parts))

In [None]:
# Vocabulary size that was configured for the Keras tokenizer.
max_words

In [None]:
# Building the model
# The network is trained on TF-IDF bag-of-words rows: one float weight per
# vocabulary word, not a padded sequence of integer token ids. An Embedding
# layer interprets its input as integer indices, so the fractional TF-IDF
# weights would all be truncated to index 0 and every tweet would look the
# same to an Embedding + LSTM stack. A feed-forward classifier that consumes
# the BOW vector directly is used instead.
model = Sequential()
# Input width follows the BOW matrix so it stays in sync with the vocabulary size.
model.add(layers.Input(shape=(X_train.shape[1],)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.5))
# Three output units, one per class (negative / neutral / positive), matching
# the one-hot encoded labels.
model.add(layers.Dense(3,activation='softmax'))
model.compile(optimizer='rmsprop',loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
# len(X_train[-100_000:])

In [None]:
# Quick training run on a small subset: fit on the last 5,000 training rows
# and validate on the first 5,000 test rows.
train_rows = slice(-5_000, None)
val_rows = slice(None, 5000)
history = model.fit(
    X_train[train_rows],
    y_train[train_rows],
    epochs=5,
    validation_data=(X_test[val_rows], y_test[val_rows]),
)

In [None]:
# Evaluate on the complete held-out test split and report its accuracy.
evaluation = model.evaluate(X_test, y_test, verbose=2)
test_loss, test_acc = evaluation
print('Model accuracy: ', test_acc)