### Load the data

In [1]:
# read data from text files
with open('data/tweets_full.txt', 'r') as f:
    reviews = f.read()

In [2]:
from string import punctuation

# get rid of punctuation
reviews = reviews.lower() # lowercase, standardize
all_text = ''.join([c for c in reviews if c not in punctuation])

# split by new lines and spaces
reviews_split = all_text.split('\n')
sentences = [review.split(" ") for review in reviews_split]
sentences.append(["none"])
print(sentences[-10:])

[['jasimmo', 'ooo', 'showing', 'of', 'your', 'french', 'skills', 'lol', 'things', 'good', 'over', 'here', 'lovely', 'weather', 'so', 'should', 'be', 'outside', '', 'hows', 'u', ''], ['sendsome2me', 'haha', 'yeah', 'twitter', 'has', 'many', 'uses', 'for', 'me', 'its', 'just', 'to', 'know', 'what', 'the', 'ppl', 'i', 'care', 'about', 'are', 'doing', ''], ['succesfully', 'following', 'tayla', ''], ['johnlloydtaylor', ''], ['happy', 'mothers', 'day', '', 'all', 'my', 'love', ''], ['happy', 'mothers', 'day', 'to', 'all', 'the', 'mommies', 'out', 'there', 'be', 'you', 'woman', 'or', 'man', 'as', 'long', 'as', 'youre', 'momma', 'to', 'someone', 'this', 'is', 'your', 'day', ''], ['niariley', 'wassup', 'beautiful', 'follow', 'me', '', 'peep', 'out', 'my', 'new', 'hit', 'singles', 'wwwmyspacecomipsohot', 'i', 'def', 'wat', 'u', 'in', 'the', 'video', ''], ['mopedronin', 'bullet', 'train', 'from', 'tokyo', '', '', '', 'the', 'gf', 'and', 'i', 'have', 'been', 'visiting', 'japan', 'since', 'thursday

### Train word2vec with the data

In [3]:
from gensim.models import Word2Vec


# define training data

# train model
model = Word2Vec(sentences, min_count=1, size=300)
# summarize the loaded model
print(model)
# summarize vocabulary
words = list(model.wv.vocab)
print(words)
# access vector for one word
print(model['funeral'])
# save model
model.save('trained_w2v.bin')

Word2Vec(vocab=53746, size=300, alpha=0.025)
[-2.74211038e-02 -2.65432168e-02 -1.48387015e-01  4.23575789e-02
  3.51253040e-02 -3.10296249e-02 -2.23468225e-02 -3.44974808e-02
 -2.84659956e-02 -3.77147943e-02 -1.25232124e-04 -4.59737098e-03
 -5.75926621e-04  4.76132110e-02 -5.23192771e-02  2.98923329e-02
 -5.76919038e-03 -2.46865395e-02 -7.16227852e-03  8.67475476e-03
 -1.82534847e-02 -2.79901158e-02  1.70485508e-02 -3.99463214e-02
  5.35748154e-03 -1.05881039e-02 -1.08834673e-02 -2.14016344e-02
  9.80516896e-03  5.35947531e-02  2.31307745e-02  3.17522921e-02
  7.62290228e-03  5.22554154e-03  1.07991544e-03  2.21502315e-02
  3.75422724e-02  8.22361652e-03  5.62417135e-03  9.51129477e-03
  1.66402999e-02  3.01900296e-03 -6.15451783e-02 -6.50617806e-03
 -1.00727575e-02  2.76215672e-02  1.00510791e-02 -4.45844643e-02
  6.16558082e-02  4.38489094e-02 -8.24384298e-03 -2.43484583e-02
  2.42077075e-02 -4.70486982e-03  2.22886819e-02  2.45033521e-02
 -4.42474931e-02 -6.58976287e-02  5.43162823e

  print(model['funeral'])


In [4]:
from gensim.models import KeyedVectors

# load model
new_model = Word2Vec.load('trained_w2v.bin')
print(new_model)
print(len(new_model.wv['none']))
print("none" in words)

new_model = KeyedVectors.load('trained_w2v.bin')
print(new_model.wv.vectors)

Word2Vec(vocab=53746, size=300, alpha=0.025)
300
True
[[-4.2674062e-01  1.0696528e-01 -1.4644409e+00 ...  4.9810160e-02
   2.0994665e-01  3.0871007e-01]
 [-1.2648560e+00  2.7750504e-01 -1.3830882e+00 ... -2.6240353e-02
   5.7821620e-01  3.2545090e-01]
 [-2.6542953e-01 -3.4512815e-01 -4.5403066e-01 ...  4.4313240e-01
   3.5355222e-01 -2.0003715e-01]
 ...
 [-4.9631060e-03 -1.6257972e-03 -1.8345876e-02 ...  2.1563626e-03
  -6.7100041e-03  2.7423985e-03]
 [-5.2258233e-03 -1.2848869e-03 -1.5606534e-02 ... -1.0143375e-03
  -1.7633450e-03  1.0247926e-03]
 [-3.0858757e-03 -2.2082239e-04 -1.2977008e-02 ...  2.3611996e-03
  -3.9495192e-03  2.3848482e-03]]


### Import Google Pre-trained model

In [5]:
from gensim.models import KeyedVectors
filename = 'GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [6]:
try:
    print(model[['Hello', 'my']])
except:
    print("hello")

[[-5.10253906e-02  1.20605469e-01 -1.25732422e-02  3.08837891e-02
  -8.54492188e-02  5.34667969e-02 -1.39648438e-01 -1.63085938e-01
   8.30078125e-02  2.05078125e-01 -9.42382812e-02  1.55273438e-01
  -4.18090820e-03  2.11181641e-02 -1.09863281e-01  2.24609375e-01
   2.22656250e-01  1.59179688e-01  5.78613281e-02 -1.55273438e-01
   2.30468750e-01  3.47656250e-01  4.43359375e-01 -1.45507812e-01
   1.51367188e-01  2.20947266e-02 -1.63085938e-01  2.57812500e-01
   2.00195312e-01  6.05468750e-02 -2.22167969e-02 -1.31835938e-01
  -6.39648438e-02 -1.27929688e-01  9.81445312e-02 -1.35742188e-01
   1.92871094e-02  2.51953125e-01  1.42578125e-01  2.42187500e-01
   3.17382812e-02 -1.70898438e-01  2.61718750e-01  4.29687500e-01
   2.48046875e-01 -1.03515625e-01 -1.38671875e-01 -1.42578125e-01
  -2.07031250e-01 -8.60595703e-03 -4.33593750e-01 -1.75781250e-02
   4.02343750e-01  3.26171875e-01  3.32031250e-01  1.00097656e-02
  -1.15234375e-01 -1.26953125e-01  1.94091797e-02 -2.53906250e-01
   3.93676