In [13]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import sys
import html
import pickle
from pprint import pprint
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC, SVC, NuSVC 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, GRU
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec

In [2]:
# load training data & test data

names = ('polarity', 'id', 'date', 'query', 'author', 'text')
data_train = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin1', names=names)
data_test = pd.read_csv('testdata.manual.2009.06.14.csv', names=names)

# sample n from 160k tweets
data_train_sample = data_train.sample(400000)

# split into X and y
textcorpus = data_train['text']
text_train_all = data_train_sample['text']
target_train_all = data_train_sample['polarity'].values
target_train_all = target_train_all / 4

# split training csv into training and validation components

text_train_small, text_validation, target_train_small, target_validation = train_test_split(
    text_train_all, target_train_all, test_size=.2, random_state=42)

del data_train, data_test, data_train_sample, target_train_all

In [3]:
#######################
# Tweet text cleaning #
#######################

# Load contractions dictionary
contractions = pickle.load(open("contractions.pickle", "rb"))

# Expand contractions
def contraction_remove(line):
    for word in line.split():
        if word.lower() in contractions:
            line = line.replace(word, contractions[word.lower()])
    return line

# General text cleanup
def text_cleanup(text):
    #Remove &quot; or &amp;
    souped = html.unescape(text)
    #Remove @mentions
    #souped = re.sub(r'@\w+','',souped)
    #Remove http / https links
    souped = re.sub(r'https?://\S*','',souped)
    #Remove all remaining numbers / non-letters
    souped = re.sub("[^a-zA-Z]",' ',souped)
    #All lower case
    #souped = souped.lower()
    return(souped)

# Remove stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def stopword_remove(line):
    words = [w.strip() for w in line.split() if not w.strip() in stop_words]
    words = str.join(' ',words)
    return words

# Top level tweet cleaner function
def tweets_clean(tweets):
    #tweets = tweets.apply(lambda x: contraction_remove(x))
    tweets = tweets.apply(lambda x: text_cleanup(x))
    #tweets = tweets.apply(lambda x: stopword_remove(x))
    return tweets

text_train_small = tweets_clean(text_train_small)
text_validation = tweets_clean(text_validation)
text_train_all = tweets_clean(text_train_all)
textcorpus = tweets_clean(textcorpus)

In [4]:
corpustokens = []
for line in textcorpus:
    corpustokens.append(line.split(' '))

In [5]:
corpustokens

[['',
  'switchfoot',
  '',
  '',
  '',
  'Awww',
  '',
  'that',
  's',
  'a',
  'bummer',
  '',
  '',
  'You',
  'shoulda',
  'got',
  'David',
  'Carr',
  'of',
  'Third',
  'Day',
  'to',
  'do',
  'it',
  '',
  '',
  'D'],
 ['is',
  'upset',
  'that',
  'he',
  'can',
  't',
  'update',
  'his',
  'Facebook',
  'by',
  'texting',
  'it',
  '',
  '',
  '',
  'and',
  'might',
  'cry',
  'as',
  'a',
  'result',
  '',
  'School',
  'today',
  'also',
  '',
  'Blah',
  ''],
 ['',
  'Kenichan',
  'I',
  'dived',
  'many',
  'times',
  'for',
  'the',
  'ball',
  '',
  'Managed',
  'to',
  'save',
  '',
  '',
  '',
  '',
  '',
  'The',
  'rest',
  'go',
  'out',
  'of',
  'bounds'],
 ['my',
  'whole',
  'body',
  'feels',
  'itchy',
  'and',
  'like',
  'its',
  'on',
  'fire',
  ''],
 ['',
  'nationwideclass',
  'no',
  '',
  'it',
  's',
  'not',
  'behaving',
  'at',
  'all',
  '',
  'i',
  'm',
  'mad',
  '',
  'why',
  'am',
  'i',
  'here',
  '',
  'because',
  'I',
  'can',
  't

In [6]:
w2v = Word2Vec(corpustokens, size=300)

In [7]:
w2v.most_similar(positive=['happy'])

  """Entry point for launching an IPython kernel.


[('pleased', 0.6263509392738342),
 ('thrilled', 0.5964668393135071),
 ('thankful', 0.5762211084365845),
 ('upset', 0.5726284980773926),
 ('grateful', 0.5704963803291321),
 ('unhappy', 0.5698879361152649),
 ('excited', 0.5605750679969788),
 ('depressed', 0.5534179210662842),
 ('proud', 0.5514568090438843),
 ('exited', 0.5495133996009827)]

In [8]:
seq_lengths = text_train_small.apply(lambda x: len(x.split(' ')))
seq_lengths.describe()

count    320000.000000
mean         18.586037
std           9.632124
min           2.000000
25%          11.000000
50%          17.000000
75%          25.000000
max         356.000000
Name: text, dtype: float64

In [9]:
max_words = 37 # Maximum length of tweet word sequence

tokenizer = Tokenizer()
tokenizer.fit_on_texts(textcorpus)
vocab_size = len(tokenizer.word_index) + 1
tokencorpus = tokenizer.texts_to_sequences(text_train_small)
x_train = tokenizer.texts_to_sequences(text_train_small)
x_valid = tokenizer.texts_to_sequences(text_validation)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % vocab_size)
sequences_train = pad_sequences(x_train, maxlen=max_words)
sequences_valid = pad_sequences(x_valid, maxlen=max_words)

Found 556144 unique tokens.


In [10]:
embeddings_len = len(w2v['hi'])
print("EMBEDDINGS_LEN=", embeddings_len)  # 300
 
embeddings_matrix = np.zeros((vocab_size, embeddings_len))
for word, idx in tokenizer.word_index.items():
    try:
        embedding = w2v[word]
        embeddings_matrix[idx] = embedding
    except:
        pass

embedding_layer = Embedding(vocab_size,
                            embeddings_len,
                            weights=[embeddings_matrix],
                            trainable=False)

  """Entry point for launching an IPython kernel.
  import sys


EMBEDDINGS_LEN= 300


In [11]:
model = Sequential()
model.add(embedding_layer)
model.add(GRU(300))
model.add(Dense(units=1, activation='sigmoid'))
 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 300)         166843200 
_________________________________________________________________
gru_1 (GRU)                  (None, 300)               540900    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 301       
Total params: 167,384,401
Trainable params: 541,201
Non-trainable params: 166,843,200
_________________________________________________________________
None


In [12]:
model.fit(sequences_train, target_train_small, batch_size=200, epochs=1, validation_data=(sequences_valid, target_validation))

Instructions for updating:
Use tf.cast instead.
Train on 320000 samples, validate on 80000 samples
Epoch 1/1


<keras.callbacks.History at 0x1b74550710>