In [1]:
import pandas as pd
import gensim.downloader as api
from gensim.models import KeyedVectors
from gensim.models import KeyedVectors, Word2Vec, phrases
from gensim.parsing import preprocessing
from gensim.parsing.preprocessing import strip_tags, strip_punctuation,strip_numeric,remove_stopwords
import gensim
import gensim.corpora as corpora
from gensim.models import LdaModel
from os import walk
from os import listdir
from os.path import isfile, join

from pprint import pprint

import pickle

from matplotlib import pyplot as plt

import numpy as np
import seaborn as sns
import matplotlib.colors as mcolors

In [2]:
#  Clean data
def text_cleaning(data):
    new_sentences = []
    filters = [lambda x: x.lower(), strip_tags, strip_punctuation,strip_numeric,remove_stopwords]
    no_findings = []
    for i in range(data.shape[0]):
        txt = data.iloc[i]
        c_words = []
        words = preprocessing.preprocess_string(txt, filters)
        for w in words:
            if len(w)>3 :
                c_words.append(w)
            new_sentences.append(c_words)
    
    return new_sentences

In [3]:
data = pd.read_csv("Breast_Cancer.csv")['Text']
sentences = text_cleaning(data)

In [4]:
data = pd.read_csv("Breast_Cancer.csv")['Text']
sentences = text_cleaning(data)

In [5]:
# create Bigrams
bigram = gensim.models.Phrases(sentences) 
bigram_mod = gensim.models.phrases.Phraser(bigram)
data_ready = [bigram_mod[doc] for doc in sentences]

In [6]:
# Create Dictionary
id2word = corpora.Dictionary(data_ready)

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]

In [13]:
model_scratch = Word2Vec(workers=8,size=100, iter=10, min_count=1, 
                        window = 4, negative = 5) # create the model object


model_tune = Word2Vec(workers=8,size=100, iter=10, min_count=1, 
                        window = 4, negative = 5) # create the model object
    

In [14]:
model_scratch.build_vocab(data_ready) 
total_examples = model_scratch.corpus_count
total_examples

94560

In [15]:
model_tune.build_vocab(data_ready) 
total_examples = model_tune.corpus_count
total_examples

94560

In [16]:
### train from scratch
model_scratch.train(data_ready, total_examples=total_examples, epochs=200)
model_scratch.save("w2vec_scratch.model")

In [17]:
# Load from drug-related w2vec
path_tune = "dms/trig-vectors-phrase.bin"
word_vectors = KeyedVectors.load_word2vec_format(path_tune, binary=True, encoding='utf8', unicode_errors='ignore')

In [None]:
# ------------------------------------------ #####

In [26]:
# Q1
model_load = Word2Vec.load("w2vec_scratch.model")
similar = [x[0] for x in model_load.wv.most_similar("tamoxifen",topn=20)]
similar

['exactly',
 'chemo',
 'want',
 'awesome',
 'marr_metastatic',
 'surgery',
 'return',
 'care',
 'jeopardy',
 'stupid',
 'told',
 'osteoporosis_scan',
 'went',
 'true',
 'fight',
 'years',
 'clear_margins',
 'immune_compromising',
 'letrozole_femara',
 'tweets']

In [55]:
# Q2 - Best ration 0.75 with 4 words - tamoxifin tamoxafin tamoxifan tomoxifen 
import Levenshtein
similar = [x[0] for x in word_vectors.most_similar("tamoxifen",topn=10000) ]
similar[0:100]

['cyclosporine',
 'arimidex',
 'cyclophosphamide',
 'oral_contraceptives',
 'methotrexate_mtx',
 'letrozole',
 'dapsone',
 'clomiphene',
 'parlodel',
 'oral_contraceptive',
 'methatrexate',
 'azathioprine_imuran',
 'byetta',
 'dexamethasone',
 'methotraxate',
 'sutent',
 'immunosuppressant_drugs',
 'carbergoline',
 'postmenopausal_osteoporosis',
 'tnf_inhibitors',
 'sulphasalazine',
 'aromatase_inhibitors',
 'cyclosporin',
 'cytoxan',
 'cholesterol_lowering_drugs',
 'nolvadex',
 'valproate',
 'lupron_depot',
 'mesalazine',
 'glyburide',
 'mercaptopurine',
 'neoral',
 'nexavar',
 'birth_control_pills',
 'testosterone_replacement',
 'evista',
 'cimetidine',
 'levothroid',
 'moclobemide',
 'azathioprine',
 'glucophage',
 'birth_control_pill',
 'folinic_acid',
 'injectible',
 'cabergoline_dostinex',
 'plaquenil_methotrexate',
 'st_john’s_wort',
 'corticosteroids',
 'warfarin_coumadin',
 'adriamycin',
 'methrotrexate',
 'raptiva',
 'spironolactone',
 'folfiri',
 'purinethol',
 'hydroxychlor

In [43]:
def get_mispelling(word, similar, ratio):
    out = []
    for w in similar:
        lev_ratio = Levenshtein.ratio(word, w)
        if lev_ratio > ratio:
            out.append(w)
    return out

In [48]:
words = get_mispelling("tamoxifen", similar, 0.6)
for w in words:
    print(w, end=' ')

tamoxifin cytoxin tamoxafin raloxifene tamox tamoxifan lanoxin tamsulosin tomoxifen amoxapine rutaxin amoxicilin amox tovaxin amoxcillin raloxifine amoxil amoxicllin amoxillin 

In [49]:
words = get_mispelling("tamoxifen", similar, 0.7)
for w in words:
    print(w, end=' ')

tamoxifin tamoxafin raloxifene tamox tamoxifan tomoxifen 

In [50]:
# Best
words = get_mispelling("tamoxifen", similar, 0.75)
for w in words:
    print(w, end=' ')

tamoxifin tamoxafin tamoxifan tomoxifen 

In [51]:
words = get_mispelling("tamoxifen", similar, 0.8)
for w in words:
    print(w, end=' ')

tamoxifin tamoxifan tomoxifen 

In [53]:
words = get_mispelling("tamoxifen", similar, 0.85)
for w in words:
    print(w, end=' ')

tamoxifin tamoxifan tomoxifen 