This notebook computes similarity scores between words in the craving corpus, using word vectors from a Word2Vec model trained with Gensim.

In [None]:
import pandas as pd
import gensim.downloader as api
# Ask the gensim downloader for the location of the text8 corpus rather than
# the loaded data. Within this notebook the path is only referenced by the
# commented-out alternative training call.
text8_path = api.load(name='text8', return_path=True)
print("Using corpus from " + str(text8_path))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Using corpus from /root/gensim-data/text8/text8.gz


In [None]:
# Load the posts to analyse. Point the path at all_words.csv instead to work
# on the alternative word list.
cravings = pd.read_csv(filepath_or_buffer="Data/craving_words.csv")

cravings

Unnamed: 0,post
0,The cravings….\nThe cravings suck. I’m only da...
1,Day 235 — would like to tune my sleep\nIf I ea...
2,Hard few days\nHit my first rough patch after ...
3,Stories on delirium tremens?\nDoes anyone have...
4,Day 5 today! Woot woot!\nSo happy that today i...
...,...
40587,Stages of grief in early recovery?! Bargaining...
40588,Back to square one\nWent to the grocery store ...
40589,Day 1 back agian\nHi there I made it through t...
40590,Do you have advice for dealing with cravings?\...


Preprocessing

In [None]:
!pip install nltk
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

tokenizer = RegexpTokenizer(r'\w+')
cravings['post'].dropna(inplace=True)
cravings['post'] = cravings['post'].str.lower()
cravings['post_tokens'] = cravings['post'].astype(str).apply(tokenizer.tokenize)

# Make a list of english stopwords
stopwords = nltk.corpus.stopwords.words("english")
cravings['post_tokens'] = cravings['post_tokens'].apply(lambda x: [item for item in x if item not in stopwords])

wordnet_lem = WordNetLemmatizer()

cravings['post_lemmas'] = cravings['post_tokens'].apply(lambda l: [wordnet_lem.lemmatize(x) for x in l])

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Display the frame to check the post_tokens and post_lemmas columns next to the lower-cased posts.
cravings

Unnamed: 0,post,post_tokens,post_lemmas
0,the cravings….\nthe cravings suck. i’m only da...,"[cravings, cravings, suck, day, 2, think, want...","[craving, craving, suck, day, 2, think, wantin..."
1,day 235 — would like to tune my sleep\nif i ea...,"[day, 235, would, like, tune, sleep, eat, clos...","[day, 235, would, like, tune, sleep, eat, clos..."
2,hard few days\nhit my first rough patch after ...,"[hard, days, hit, first, rough, patch, two, we...","[hard, day, hit, first, rough, patch, two, wee..."
3,stories on delirium tremens?\ndoes anyone have...,"[stories, delirium, tremens, anyone, experienc...","[story, delirium, tremens, anyone, experience,..."
4,day 5 today! woot woot!\nso happy that today i...,"[day, 5, today, woot, woot, happy, today, day,...","[day, 5, today, woot, woot, happy, today, day,..."
...,...,...,...
40587,stages of grief in early recovery?! bargaining...,"[stages, grief, early, recovery, bargaining, p...","[stage, grief, early, recovery, bargaining, ph..."
40588,back to square one\nwent to the grocery store ...,"[back, square, one, went, grocery, store, avoi...","[back, square, one, went, grocery, store, avoi..."
40589,day 1 back agian\nhi there i made it through t...,"[day, 1, back, agian, hi, made, today, recentl...","[day, 1, back, agian, hi, made, today, recentl..."
40590,do you have advice for dealing with cravings?\...,"[advice, dealing, cravings, days, cravings, re...","[advice, dealing, craving, day, craving, reall..."


Train Gensim word2vec model

In [None]:
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.word2vec import Text8Corpus

# Training corpus: one list of lemmas per post.
# NOTE(review): `craving_words` was never assigned anywhere in this notebook,
# so this cell raised NameError on a fresh kernel. The lemma column is assumed
# to be the intended input (the similarity queries below use the lemma
# 'craving') - confirm against the run that produced the saved outputs.
craving_words = cravings['post_lemmas'].tolist()

# Using params from Word2Vec_FastText_Comparison
# NOTE: 'size' and 'iter' are the gensim 3.x keyword names; gensim 4 renamed
# them to 'vector_size' and 'epochs'.
params = {
    'alpha': 0.05,      # initial learning rate
    'size': 200,        # dimensionality of the word vectors
    'window': 10,       # context window size
    'iter': 5,          # passes over the corpus
    'min_count': 10,    # for craving 30 or 50; rarer words are left out of the vocabulary
    'sample': 1e-4,     # down-sampling threshold for frequent words
    'sg': 1,            # 1 = skip-gram
    'hs': 0,            # 0 = no hierarchical softmax; negative sampling is used
    'negative': 10,     # number of negative samples
}

model = Word2Vec(sentences=craving_words, **params)
#model = Word2Vec(Text8Corpus(text8_path), **params)
wv = model.wv
print("Using trained model", wv)

Using trained model <gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x7fe1dae1de50>


In [None]:
# Similarity of the craving terms to the weekend, each weekday and night.
# The pairs are queried and printed in the order listed here.
craving_targets = ['weekend', 'monday', 'tuesday', 'wednesday', 'thursday',
                   'friday', 'saturday', 'sunday']
word_pairs = [('craving', target) for target in craving_targets]
word_pairs += [('crave', 'weekend'), ('crave', 'night')]

for left_word, right_word in word_pairs:
    print(wv.similarity(left_word, right_word))

0.3554439
0.31156003
0.24855869
0.2307463
0.3001913
0.40442982
0.37161648
0.31790525


In [None]:
!pip install annoy
from gensim.similarities.index import AnnoyIndexer

# 100 trees are being used in this example
annoy_index = AnnoyIndexer(model, 100)


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting annoy
  Downloading annoy-1.17.1.tar.gz (647 kB)
[K     |████████████████████████████████| 647 kB 5.0 MB/s 
[?25hBuilding wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.1-cp37-cp37m-linux_x86_64.whl size=395186 sha256=f62a86ce0adf475da2769184201e2879848813dfbaf6d529c4872d19a28c571b
  Stored in directory: /root/.cache/pip/wheels/81/94/bf/92cb0e4fef8770fe9c6df0ba588fca30ab7c306b6048ae8a54
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.1


  index = AnnoyIndex(num_features)


In [None]:
# For every concept word, collect its nearest neighbours in the embedding
# space together with the similarity score and the neighbour's corpus count,
# then place the per-concept tables side by side.
print_annoy_sims = False   # also print approximate neighbours from the Annoy index
print_normal_sims = False  # also print the exact neighbours while collecting them
concepts = ["alone", "friend", "family", "partner", "colleague", "home", "school","university", "work", "restaurant", "bar", "party", "workout", "supermarket", "outdoor", "airport","anxious", "sad", "stressed", "tired", "frustrated", "happy", "proud", "bored", "smoke","cigarette"]

similar_terms = []     # one DataFrame per concept
missing_concepts = []  # concepts absent from the model vocabulary

for concept in concepts:
    # Words rarer than min_count never enter the vocabulary, and wv[concept]
    # would abort the whole loop with a KeyError for them.
    if concept not in wv:
        missing_concepts.append(concept)
        continue

    # Querying by vector rather than by word means the concept itself is
    # returned as its own closest neighbour.
    vector = wv[concept]
    if print_annoy_sims:
        # The instance of AnnoyIndexer we just created is passed, AnnoyIndexer is an approximate neighbour algorithm by Spotify that is faster, we prefer the other for this use case though
        approximate_neighbors = wv.most_similar([vector], topn=20, indexer=annoy_index)
        # Neatly print the approximate_neighbors and their corresponding cosine similarity values
        print("Approximate Neighbors (via AnnoyIndexer by Spotify)")
        for neighbor in approximate_neighbors:
            print("%-*s  Occ.: %s" % (35, neighbor, wv.vocab[neighbor[0]].count))

    normal_neighbors = wv.most_similar([vector], topn=30)
    if print_normal_sims:
        print("\nExact Neighbors (via Gensim)")

    # Gather plain records and build the frame once; growing a DataFrame with
    # DataFrame.append per row is quadratic and no longer exists in pandas 2.
    column_names = [concept, concept + '_sim', concept + '_count']
    records = []
    for neighbor in normal_neighbors:
        word, score = neighbor
        occurrences = wv.vocab[word].count
        records.append({
            concept: word,
            concept + '_sim': round(float(score), 3),
            concept + '_count': occurrences,
        })
        if print_normal_sims:
            print("%-*s  Occ.: %s" % (35, neighbor, occurrences))
    similar_terms.append(pd.DataFrame(records, columns=column_names))

if missing_concepts:
    print("Skipped concepts missing from the vocabulary:", missing_concepts)

similarities = pd.concat(similar_terms, axis=1)
similarities

Unnamed: 0,smoke,smoke_count,cigarette,cigarette_count
0,smoke,1213.0,cigarette,1102.0
1,cigarette,1102.0,smoking,1914.0
2,weed,1598.0,smoke,1213.0
3,smoking,1914.0,cigs,135.0
4,smoked,437.0,chimney,12.0
5,cigs,135.0,smoked,437.0
6,smoker,180.0,smoker,180.0
7,chimney,12.0,juul,16.0
8,juul,16.0,tobacco,162.0
9,puff,30.0,nicotine,296.0


In [None]:
# Write the neighbour table to disk with a header row, every column in its
# current order, and without the row index.
similarities.to_csv(
    'similar_terms.csv',
    columns=similarities.columns.tolist(),
    header=True,
    index=False,
)

In [None]:
# Report the number of entries in the trained model's vocabulary.
vocabulary_size = len(wv.vocab)
print("vocabulary length is:", vocabulary_size, sep="\n")

12340
