In [3]:
import gensim 

In [5]:
import pandas as pd

In [6]:
# Load the Wikipedia CSV; its `text` column is what read_corpus tokenizes in a later cell.
wiki_data = pd.read_csv('data/wikipedia/cleaned_wiki_data_full_text_chunks.csv')

In [23]:
def read_corpus(list_of_docs, tokens_only=False):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    Parameters
    ----------
    list_of_docs : iterable
        Raw documents; each one is handed unchanged to
        ``gensim.utils.simple_preprocess``.
    tokens_only : bool, default False
        When true, yield just the token list of each document. Otherwise
        yield a ``TaggedDocument`` whose single tag is the document's
        zero-based position in ``list_of_docs``.

    Yields
    ------
    One item per input document, in input order.
    """
    for doc_id, raw_doc in enumerate(list_of_docs):
        words = gensim.utils.simple_preprocess(raw_doc)
        # Training data carries a tag; inference-style data is bare tokens.
        yield words if tokens_only else gensim.models.doc2vec.TaggedDocument(words, [doc_id])

In [24]:
# Tokenize the corpus once into TaggedDocuments (tag = position in wiki_data.text).
train_corpus = list(read_corpus(wiki_data.text))
# Reuse the words already produced above instead of running simple_preprocess
# over every document a second time. Despite the name this is not a held-out
# set: test_corpus[i] is the token list of the document tagged i in
# train_corpus (the very same list object, so treat it as read-only).
test_corpus = [tagged_doc.words for tagged_doc in train_corpus]

In [25]:
# Instantiate an untrained Doc2Vec model; the vocabulary is built and the model
# is trained in the cells that follow.
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [26]:
# Build the model's vocabulary from the tagged training documents.
model.build_vocab(train_corpus)

In [27]:
# Train on the tagged corpus using the model's own corpus_count and epochs
# settings; %time reports how long the call took (see the output below).
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 50min 40s, sys: 1min 25s, total: 52min 6s
Wall time: 20min 14s


In [68]:
# Sample query passages, addressable by name. Only the selected passage is
# tokenized and embedded, rather than tokenizing all three into the same
# variable and keeping just the last one.
sample_passages = {
    'cat': 'The domestic cat (Felis silvestris catus or Felis catus)[1][4] is a small, typically furry, carnivorous mammal. They are often called house cats[5] when kept as indoor pets or simply cats when there is no need to distinguish them from other felids and felines. They are often valued by humans for companionship and for their ability to hunt vermin. There are more than seventy cat breeds recognized by various cat registries.',
    'bouldering': 'Bouldering is a form of rock climbing that is performed on small rock formations or artificial rock walls, known as boulders, without the use of ropes or harnesses. While it can be done without any equipment, most climbers use climbing shoes to help secure footholds, chalk to keep their hands dry and provide a firmer grip, and bouldering mats to prevent injuries from falls. Unlike free solo climbing, which is also performed without ropes, bouldering problems (the sequence of moves that a climber performs to complete the climb) are usually less than 6 meters (20 ft.) tall. Traverses, which are a form of boulder problem, require the climber to climb horizontally from one end to another. [1]:3 Artificial climbing walls allow boulderers to train indoors in areas without natural boulders. In addition, Bouldering competitions take place in both indoor and outdoor settings',
    'wrestling': 'Wrestling is a combat sport involving grappling type techniques such as clinch fighting, throws and takedowns, joint locks, pins and other grappling holds. The sport can either be theatrical for entertainment (see professional wrestling), or genuinely competitive. A wrestling bout is a physical competition, between two (occasionally more) competitors or sparring partners, who attempt to gain and maintain a superior position. There are a wide range of styles with varying rules with both traditional historic and modern styles. Wrestling techniques have been incorporated into other martial arts as well as military hand-to-hand combat systems.',
}
# 'wrestling' is the passage whose vector produced the most_similar output below.
query_topic = 'wrestling'
statement = gensim.utils.simple_preprocess(sample_passages[query_topic])
test_vector = model.infer_vector(statement)

In [69]:
# Rank the training documents against the inferred vector; the output below
# lists (document tag, similarity score) pairs.
model.docvecs.most_similar([test_vector])

  if np.issubdtype(vec.dtype, np.int):


[(17651, 0.7430974841117859),
 (90981, 0.7396022081375122),
 (67709, 0.7319555282592773),
 (47976, 0.7282230854034424),
 (39991, 0.7220035791397095),
 (44991, 0.7101404070854187),
 (15456, 0.7101379632949829),
 (96825, 0.6990391612052917),
 (96150, 0.6983598470687866),
 (135050, 0.6983575820922852)]

In [76]:
# 90981 is the tag of the second-ranked hit in the most_similar output above.
# NOTE(review): `clean` and `pos_tag` come from the In [65] cell, which appears
# further down in this file; that cell has to run before this one.
test = clean(test_corpus[90981])
pos_tag(test)

[('sports', 'NNS'),
 ('japan', 'VBP'),
 ('part', 'NN'),
 ('culture', 'NN'),
 ('japan', 'NN'),
 ('sports', 'NNS'),
 ('sumo', 'VBP'),
 ('judo', 'JJ'),
 ('karate', 'NN'),
 ('sports', 'NNS'),
 ('imported', 'VBN'),
 ('baseball', 'NN'),
 ('soccer', 'NN'),
 ('golf', 'NN'),
 ('skiing', 'VBG'),
 ('sports', 'NNS'),
 ('japan', 'NN'),
 ('encyclopedia', 'VBP'),
 ('sports', 'NNS'),
 ('participant', 'JJ'),
 ('participants', 'NNS'),
 ('onlooker', 'NN'),
 ('onlookers', 'NNS')]

In [83]:
import pickle as p

# Pickle the tokenized (untagged) corpus to disk; list positions correspond to
# the document tags used for training.
with open('models/simple_wiki_chunked_corpus.p', 'wb') as topick:
    p.dump(test_corpus, topick)


In [65]:
import os
import random
import codecs
from collections import defaultdict

from gensim.models.ldamodel import LdaModel as Lda
from gensim import corpora
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from nltk import pos_tag

import enchant
# Module-level NLP resources shared by the helper functions below.
spelling_dict = enchant.Dict("en_US")  # en_US spell checker consulted by clean()

stop = set(stopwords.words('english'))  # English stop words, as a set for membership tests
lemma = WordNetLemmatizer()  # not called by any active code visible here
stemmer = PorterStemmer()  # used by stem_and_update_stem_dict()

# Maps each stem to the set of original tokens that were reduced to it;
# populated by stem_and_update_stem_dict().
stemmed_dict = defaultdict(set)

def stem_and_update_stem_dict(tokens):
    """Stem every token and remember which surface forms produced each stem.

    Each token is passed through the module-level ``stemmer``. Whenever the
    resulting stem differs from the token, the token is added to
    ``stemmed_dict[stem]``.

    Returns the list of stems, in the same order as ``tokens``.
    """
    def _stem_and_record(word):
        # Stem one word, logging the original form only when stemming changed it.
        root = stemmer.stem(word)
        if root != word:
            stemmed_dict[root].add(word)
        return root

    return [_stem_and_record(word) for word in tokens]
        
# Part-of-speech tags whose words clean() discards; compared against the tags
# returned by nltk's pos_tag.
list_of_POS_to_ignore = ['WRB', 'WP$', 'WP',  'WDT', 'UH', 
                         'TO', 'RP', 'RBS', 'RB', 'RBR', 'PRP$', 'PRP', 
                        'MD', 'JJS', 'JJR', 'JJ', 'IN', 'FW', 'EX', 
                         'DT', 'CD']

def clean(doc):
    """Reduce a tokenized document to lowercase, correctly spelled content words.

    Steps, in order:
      1. drop empty tokens and stop words (compared case-insensitively);
      2. POS-tag the remaining tokens and drop those whose tag is in
         ``list_of_POS_to_ignore``;
      3. drop tokens rejected by ``spelling_dict`` (checked with their
         original casing, since capitalization affects the spell check);
      4. lowercase everything;
      5. drop tokens of two characters or fewer.

    No lemmatization or stemming is applied.

    Parameters
    ----------
    doc : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        The surviving tokens, lowercased, in their original order.
    """
    # Compare against `stop` on the lowercased token: lowercasing only happens
    # at step 4, so a case-sensitive test would let capitalized stop words
    # (e.g. a sentence-initial "And") through. Empty strings are dropped here
    # as well because the enchant dictionary raises ValueError when asked to
    # check "".
    tokens = [tok for tok in doc if tok and tok.lower() not in stop]

    # Remove unwanted parts of speech.
    tokens = [word for word, pos in pos_tag(tokens) if pos not in list_of_POS_to_ignore]
    # Remove words the spelling dictionary does not recognise.
    tokens = [word for word in tokens if spelling_dict.check(word)]
    # Normalise case, then discard very short words.
    tokens = [word.lower() for word in tokens]
    return [word for word in tokens if len(word) > 2]

In [75]:
# Save the trained Doc2Vec model to disk.
model.save('models/simple_wiki_chunked_doc2vec')