**CREATING CORPUS**

In [None]:
from keras.preprocessing.text import Tokenizer
from gensim.models.fasttext import FastText
from gensim.models import Word2Vec
import wikipedia
import re
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer

# Fetch the NLTK data packages this notebook relies on.
for resource_name in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource_name)

# English stop words, held in a set for fast membership tests in preprocess_text.
en_stop = set(stopwords.words('english'))

from nltk.stem import WordNetLemmatizer

# NOTE: despite its name this is a WordNet lemmatizer, not a stemmer; the name
# is kept because preprocess_text refers to it.
stemmer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmas.

    The input is coerced to ``str``. Every non-word character is replaced by
    a space, single ASCII letters surrounded by whitespace are dropped,
    whitespace runs are collapsed and the result is lower-cased. Each
    remaining token is passed through the module-level ``stemmer``; lemmas
    that appear in ``en_stop`` or are three characters or shorter are
    discarded.

    Returns:
        The surviving lemmas joined by single spaces (possibly an empty
        string).
    """
    no_symbols = re.sub(r'\W', ' ', str(text))
    no_single_letters = re.sub(r'\s+[a-zA-Z]\s+', ' ', no_symbols)
    normalized = re.sub(r'\s+', ' ', no_single_letters, flags=re.I).strip().lower()

    kept_lemmas = []
    for raw_token in normalized.split():
        lemma = stemmer.lemmatize(raw_token)
        # Stop words are checked on the lemma, not on the raw token.
        if lemma in en_stop or len(lemma) <= 3:
            continue
        kept_lemmas.append(lemma)

    return ' '.join(kept_lemmas)


def get_content_from_wikipedia(language, topic_list):
    """Download Wikipedia pages and split their combined text into sentences.

    Args:
        language: Language code handed to ``wikipedia.set_lang`` (e.g. ``'en'``).
        topic_list: Iterable of page titles, each fetched with
            ``wikipedia.page``.

    Returns:
        The list of sentences produced by ``sent_tokenize`` over the text of
        all pages, in the order the topics were given.

    Exceptions raised by ``wikipedia.page`` are not caught here and propagate
    to the caller.
    """
    wikipedia.set_lang(language)

    page_texts = [wikipedia.page(topic).content for topic in topic_list]

    # Join with a newline: gluing the pages together directly would fuse the
    # last token of one article with the first token of the next one.
    content = "\n".join(page_texts)

    return sent_tokenize(content)


def get_word_tokenized_corpus(sentence_list):
    """Turn raw sentences into lists of cleaned tokens.

    Sentences that are empty or whitespace-only are skipped. Every other
    sentence is cleaned with ``preprocess_text`` and then split with NLTK's
    ``WordPunctTokenizer``.

    Returns:
        A list with one token list per kept sentence. A token list can be
        empty when preprocessing removes every word of a sentence.
    """
    tokenizer = nltk.WordPunctTokenizer()

    corpus = []
    for sentence in sentence_list:
        if sentence.strip() == '':
            continue
        corpus.append(tokenizer.tokenize(preprocess_text(sentence)))

    return corpus


# Wikipedia page titles whose text makes up the training corpus.
topic_list = [
    "artificial intelligence",
    "deep learning",
    "genetic algorithms",
    "recommendation systems",
    "optimization",
    "technology",
    "natural language processing",
    "pattern recognition",
]

# Download the pages as sentences, then clean and tokenize every sentence.
sentence_list = get_content_from_wikipedia('en', topic_list)
word_tokenized_corpus = get_word_tokenized_corpus(sentence_list)



**BUILDING EMBEDDING MODELS**

In [10]:
# Hyper-parameters shared by both embedding models.
embedding_size = 60
window_size = 40
min_word = 5
down_sampling = 1e-2

# NOTE(review): `size` and `iter` are the keyword names used by gensim < 4.0;
# gensim 4 renamed them to `vector_size` and `epochs`.
shared_params = dict(
    size=embedding_size,
    window=window_size,
    min_count=min_word,
    sample=down_sampling,
    sg=1,
    iter=100,
)

# Train FastText and Word2Vec on the same corpus with identical settings.
ft_model = FastText(word_tokenized_corpus, **shared_params)
wv_model = Word2Vec(word_tokenized_corpus, **shared_params)

**CREATING TSV FILE**

In [None]:
import csv

# Write one tab-separated row per vocabulary word: the word followed by the
# components of its FastText vector.
# NOTE(review): the file is named 'wv_embeddings.tsv' but the vectors come
# from ft_model, not wv_model -- confirm this is intended.
#
# newline='' is what the csv module expects for files it writes to (otherwise
# rows end in '\r\r\n' on Windows). An explicit UTF-8 encoding keeps non-ASCII
# words from failing on platforms whose default encoding is not UTF-8.
with open('wv_embeddings.tsv', 'w', newline='', encoding='utf-8') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    for word in ft_model.wv.vocab:
        vector = ft_model.wv.get_vector(word).tolist()
        writer.writerow([word] + vector)