**CREATING CORPUS**

In [None]:
from keras.preprocessing.text import Tokenizer
from gensim.models.fasttext import FastText
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import wikipedia
import re

# Fetch the NLTK data packages this notebook asks for: 'punkt' and
# 'stopwords' (the latter backs the stopwords.words('turkish') lookup below).
for resource_name in ('punkt', 'stopwords'):
  nltk.download(resource_name)


def preprocess_text(text):
  """Normalize a piece of Turkish text into lowercase, space-separated words.

  Steps, in order:
    1. "â" is folded to "a".
    2. "İ" and "I" are mapped to "i", then the text is lowercased.
    3. The punctuation marks . , ? ! and every whitespace character
       (newline, tab, non-breaking space, ...) become a plain space.
    4. Every character other than a-z, space and "çğıöşü" is deleted.
    5. Words of length 1 are dropped.

  Args:
    text: the raw string to clean.

  Returns:
    The cleaned string, with words separated by exactly one space and no
    leading/trailing whitespace. May be the empty string.
  """
  text = text.replace("â", "a")

  # Lowering 'İ' directly yields "i" followed by a combining dot, so map it
  # by hand before calling lower().
  # NOTE(review): Turkish orthography lowercases 'I' to 'ı'; this maps it to
  # 'i' (same as plain lower()). Confirm that is the intended behavior.
  text = text.replace("İ", "i").replace("I", "i").lower()

  # Sentence punctuation and *all* whitespace become a plain space. Without
  # this, the whitelist below (which only allows the literal space) would
  # delete newlines/tabs and glue the neighbouring words together.
  text = re.sub(r'[.,?!\s]', ' ', text)

  # remove special characters (anything outside a-z, space, Turkish letters)
  exception_str = "çğıöşü"
  text = re.sub('[^a-z ' + exception_str + ']', '', text)

  # Drop single-character words; split()/join also collapses runs of spaces
  # and trims both ends, so no further whitespace cleanup is needed.
  return ' '.join(w for w in text.split() if len(w) > 1)


def get_content_from_wikipedia(language, topic_list):
  """Download Wikipedia articles and split their text into sentences.

  Args:
    language: language code handed to wikipedia.set_lang (e.g. 'tr').
    topic_list: iterable of article titles to fetch with wikipedia.page.

  Returns:
    A list of sentence strings produced by NLTK's sent_tokenize over the
    combined article text.

  Any exception raised by wikipedia.page for a title propagates to the
  caller.
  """
  wikipedia.set_lang(language)

  # Join the pages with a newline so the last sentence of one article is not
  # fused with the first sentence of the next.
  content = "\n".join(wikipedia.page(topic).content for topic in topic_list)

  # Only word_tokenize is imported by name at the top of the notebook, so
  # reach sent_tokenize through the already-imported nltk package.
  sentence_list = nltk.tokenize.sent_tokenize(content)

  return sentence_list


def get_word_tokenized_corpus(sentence_list):
  """Turn raw sentences into lists of cleaned, stop-word-free tokens.

  Each sentence is passed through preprocess_text, split on whitespace, and
  stripped of Turkish stop words (NLTK's list plus a few extra words and the
  number words defined below). Sentences that end up with no tokens are
  left out of the result.

  Args:
    sentence_list: iterable of sentence strings.

  Returns:
    A list of non-empty token lists, one per retained sentence.
  """
  additional_stop_words = ["başka", "diğer", "bir", "iki", "üç", "dört", 
                           "beş", "altı", "yedi", "sekiz", "dokuz", "on"]
  # A set makes the per-token membership test constant time.
  stop_words = set(nltk.corpus.stopwords.words('turkish')) | set(additional_stop_words)

  word_tokenized_corpus = []

  for sentence in sentence_list:
    tokens = [token for token in preprocess_text(sentence).split()
              if token not in stop_words]
    # Skip sentences that were empty after cleaning OR consisted only of
    # stop words, so the corpus never contains empty token lists.
    if tokens:
      word_tokenized_corpus.append(tokens)

  return word_tokenized_corpus


# Article titles to pull from Wikipedia; 'tr' below is the language code
# passed to wikipedia.set_lang.
topic_list = [
    "yapay zeka",
    "makine öğrenmesi",
    "derin öğrenme",
    "genetik algoritma",
    "tavsiye sistemleri",
    "optimizasyon",
    "teknoloji",
    "yapay sinir ağları",
    "doğal dil işleme",
    "örüntü tanıma",
]

# Raw sentences first, then the cleaned token lists used to train the models.
sentence_list = get_content_from_wikipedia('tr', topic_list)
word_tokenized_corpus = get_word_tokenized_corpus(sentence_list)

**BUILDING EMBEDDING MODELS**

In [26]:
# Hyper-parameters shared by both embedding models.
embedding_size = 60    # passed as `size`
window_size = 40       # passed as `window`
min_word = 5           # passed as `min_count`
down_sampling = 1e-2   # passed as `sample`

# Both models are trained on the same corpus with identical settings, so the
# keyword arguments are collected once and unpacked into each constructor.
shared_training_args = dict(
    size=embedding_size,
    window=window_size,
    min_count=min_word,
    sample=down_sampling,
    sg=1,
    iter=100,
)

ft_model = FastText(word_tokenized_corpus, **shared_training_args)
wv_model = Word2Vec(word_tokenized_corpus, **shared_training_args)

**CREATING TSV FILE**

In [None]:
import csv

# Write one row per vocabulary word: the word followed by its vector
# components, tab-separated.
# NOTE(review): the file is named 'wv_embeddings.tsv' but the vectors come
# from ft_model (FastText), not wv_model — confirm which model is intended.
#
# newline='' is what the csv module expects for files it writes (otherwise
# extra blank lines can appear on some platforms), and the explicit UTF-8
# encoding keeps the Turkish letters in the vocabulary from depending on the
# platform's default encoding.
with open('wv_embeddings.tsv', 'w', newline='', encoding='utf-8') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    for word in ft_model.wv.vocab.keys():
        writer.writerow([word] + ft_model.wv.get_vector(word).tolist())