In [6]:
import re
import json
import nltk
import warnings
import pymorphy2
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.stem import SnowballStemmer

# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides library deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")
# NLTK stop-word lists as sets, one per course language handled below.
stops_ru = set(stopwords.words('russian'))
stops_en = set(stopwords.words('english'))
stops_es = set(stopwords.words('spanish'))

# Stemmer for Spanish
# NOTE(review): not referenced by any other cell visible in this notebook.
stemmer_spanish = SnowballStemmer('spanish')

In [None]:
# Location of the course dump, relative to the notebook; loaded with pd.read_json(..., lines=True).
path = '../DO_record_per_line.json'

In [None]:
# Load the dump as line-delimited JSON (lines=True) and preview the first rows.
data = pd.read_json(path, lines=True)
data.head()

In [None]:
# Summary of the loaded frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Ids of the query courses; each of them is a key of the `output` dict built at the end of the notebook.
pred_course = [16871,12234,9029,18030,8170,1200]

In [None]:
# Tokeniser pattern: runs of two or more word characters (letters, digits, underscore).
regex = re.compile(pattern=r'[\w\d]{2,}', flags=re.UNICODE)

In [None]:
# Lower-case each description and split it into tokens with `regex`.
# NOTE(review): assumes every "desc" value is a string — a missing value would fail on .lower(); confirm.
data["desc_words"] = data["desc"].apply(lambda x: regex.findall(x.lower()))
data.head()

In [None]:
morph = pymorphy2.MorphAnalyzer()

# Token -> normal form. The corpus repeats the same words many times, so each
# distinct token is analysed once and then served from this dict.
_lemma_cache = {}


def _normal_form(tok):
    """Return the normal form of `tok` (first pymorphy2 parse), memoised in `_lemma_cache`."""
    try:
        return _lemma_cache[tok]
    except KeyError:
        lemma = _lemma_cache[tok] = morph.parse(tok)[0].normal_form
        return lemma


# NOTE(review): this runs on every row regardless of "lang"; pymorphy2 is
# presumably meant for the Russian descriptions only — confirm.
data["desc_words"] = data["desc_words"].apply(lambda x: [_normal_form(tok) for tok in x])
data.head()

In [None]:
# Distinct course languages and the number of courses in each
data["lang"].value_counts()

In [None]:
# Language breakdown of the query courses, followed by their full rows.
query_rows = data.loc[data["id"].isin(pred_course)]
print(query_rows["lang"].value_counts())
query_rows

In [None]:
# One frame per course language. reset_index() renumbers the rows from 0 and
# keeps the previous index as an "index" column.
data_ru, data_en, data_es = [
    data.loc[data["lang"] == code].reset_index() for code in ("ru", "en", "es")
]
for caption, frame in (("RU - ", data_ru), ("EN - ", data_en), ("ES - ", data_es)):
    print(caption, frame.shape)

Удаляем стоп-слова

In [None]:
def remove_stopwords(tokens, stopwords=None, min_length=4):
    """Drop stop words and short tokens from a token list.

    Parameters
    ----------
    tokens : list of str
        Tokens to filter.
    stopwords : iterable of str, optional
        Words to discard. When it is None or empty, `tokens` is returned
        unchanged (the same object) and `min_length` is NOT applied.
    min_length : int, default 4
        Tokens shorter than this are discarded (only when stop words are given).

    Returns
    -------
    list of str
        The surviving tokens in their original order.
    """
    if stopwords:
        banned = set(stopwords)
        return [word for word in tokens
                if word not in banned and len(word) >= min_length]
    # Nothing to filter against: hand the input back untouched.
    return tokens

# Filter every token list with the stop-word set of its own language.
data_ru["desc_words"] = data_ru["desc_words"].apply(remove_stopwords, args=(stops_ru,))
data_en["desc_words"] = data_en["desc_words"].apply(remove_stopwords, args=(stops_en,))
data_es["desc_words"] = data_es["desc_words"].apply(remove_stopwords, args=(stops_es,))
data_ru.head()

In [None]:
# Order each per-language frame by course id and renumber the index, so that
# index labels and row positions coincide. Later cells print `.index` labels
# and then address rows by position, which is only safe when the two agree.
# drop=True: the old labels are discarded rather than added as another column.
data_ru = data_ru.sort_values("id").reset_index(drop=True)
data_en = data_en.sort_values("id").reset_index(drop=True)
data_es = data_es.sort_values("id").reset_index(drop=True)
data_ru.head()

#### Считаем косинусную меру и TF-IDF

Русские тексты

In [None]:
# Query courses with Russian descriptions: ids 8170 and 1200

In [None]:
# Look up the two Russian query courses and print where they sit in data_ru.
# NOTE(review): `.index` holds index *labels*; the swap cells further down
# address rows by position (iloc) — confirm labels and positions coincide.
ind1 = data_ru[data_ru["id"] == 1200]
ind2 = data_ru[data_ru["id"] == 8170]
print(ind1.index)
print(ind2.index)

In [None]:
# Snapshot of data_ru taken before the following cells modify it;
# data_ru is restored from this copy before the second Russian query.
data_ru_t = data_ru.copy()
data_ru_t.head()

In [None]:
# Move query course 1200 into row 0 (swapping it with the current first row)
# so that the similarity cell can compare matrix[0] with every other row.
# The row position is looked up by id instead of being hard-coded; an
# IndexError here means the id is missing from data_ru.
query_pos = int(data_ru["id"].eq(1200).values.nonzero()[0][0])
first_row = data_ru.iloc[0].copy()
data_ru.iloc[0] = data_ru.iloc[query_pos].copy()
data_ru.iloc[query_pos] = first_row
data_ru.head()

In [None]:
%%time
# Fit TF-IDF on the Russian descriptions; the token lists are first joined into space-separated strings.
# NOTE(review): "desc_words" is overwritten in place, so re-running this cell would join the characters of each string.
# NOTE(review): .toarray() builds a dense matrix — memory grows with rows x vocabulary.
vectorizer_rus = TfidfVectorizer()
data_ru["desc_words"] = data_ru["desc_words"].apply(' '.join)
matrix = vectorizer_rus.fit_transform(data_ru["desc_words"]).toarray()

In [None]:
# Similarity of the query (row 0) to every remaining row -> shape (1, n_rows - 1).
query_vec = matrix[:1]
cos_m = cosine_similarity(query_vec, matrix[1:])
cos_m.shape

In [None]:
# Turn the similarity row into a single column. -1 lets NumPy infer the
# length, so the cell no longer depends on a hard-coded corpus size.
cos_m = cos_m.reshape(-1, 1)
cos_m

In [None]:
# Wrap the similarity column in a DataFrame (single column labelled 0) so it can be sorted.
cos_df = pd.DataFrame(cos_m)
cos_df.head()

In [None]:
# Row labels of the ten highest similarity scores (offsets into matrix[1:]).
index_list_ru_1200 = cos_df.sort_values(by=0, ascending=False).index[:10]
index_list_ru_1200

In [None]:
# cos_df row k was computed from matrix[1:], i.e. from row k + 1 of data_ru,
# hence the shift by one.
index_list_ru_1200 = index_list_ru_1200 + 1
# The offsets are row positions, so use .iloc (the old .ix indexer has been
# removed from pandas and resolved integers as labels on an integer index).
index_list_ru_1200 = data_ru.iloc[index_list_ru_1200]["id"].tolist()
index_list_ru_1200

In [None]:
# Restore data_ru from the snapshot before processing the next query course.
data_ru = data_ru_t.copy()
data_ru.head()

In [None]:
# Move query course 8170 into row 0 (swapping it with the current first row)
# so that the similarity cell can compare matrix[0] with every other row.
# The row position is looked up by id instead of being hard-coded; an
# IndexError here means the id is missing from data_ru.
query_pos = int(data_ru["id"].eq(8170).values.nonzero()[0][0])
first_row = data_ru.iloc[0].copy()
data_ru.iloc[0] = data_ru.iloc[query_pos].copy()
data_ru.iloc[query_pos] = first_row
data_ru.head()

In [None]:
%%time
# Fit TF-IDF on the Russian descriptions; the token lists are first joined into space-separated strings.
# NOTE(review): "desc_words" is overwritten in place, so re-running this cell would join the characters of each string.
# NOTE(review): .toarray() builds a dense matrix — memory grows with rows x vocabulary.
vectorizer_rus = TfidfVectorizer()
data_ru["desc_words"] = data_ru["desc_words"].apply(' '.join)
matrix = vectorizer_rus.fit_transform(data_ru["desc_words"]).toarray()

In [None]:
# Similarity of the query (row 0) to every remaining row -> shape (1, n_rows - 1).
query_vec = matrix[:1]
cos_m = cosine_similarity(query_vec, matrix[1:])
cos_m.shape

In [None]:
# Turn the similarity row into a single column. -1 lets NumPy infer the
# length, so the cell no longer depends on a hard-coded corpus size.
cos_m = cos_m.reshape(-1, 1)
cos_m

In [None]:
# Wrap the similarity column in a DataFrame (single column labelled 0) so it can be sorted.
cos_df = pd.DataFrame(cos_m)
cos_df.head()

In [None]:
# Row labels of the ten highest similarity scores (offsets into matrix[1:]).
index_list_ru_8170 = cos_df.sort_values(by=0, ascending=False).index[:10]
index_list_ru_8170

In [None]:
# cos_df row k was computed from matrix[1:], i.e. from row k + 1 of data_ru,
# hence the shift by one.
index_list_ru_8170 = index_list_ru_8170 + 1
# The offsets are row positions, so use .iloc (the old .ix indexer has been
# removed from pandas and resolved integers as labels on an integer index).
index_list_ru_8170 = data_ru.iloc[index_list_ru_8170]["id"].tolist()
index_list_ru_8170

Английский

In [None]:
# Query courses with English descriptions: ids 12234 and 16871.
# NOTE(review): `.index` holds index *labels*; the swap cells further down
# address rows by position (iloc) — confirm labels and positions coincide.
ind1 = data_en[data_en["id"] == 12234]
ind2 = data_en[data_en["id"] == 16871]
print(ind1.index)
print(ind2.index)

In [None]:
# Snapshot of data_en taken before the following cells modify it;
# data_en is restored from this copy before the second English query.
data_en_t = data_en.copy()
data_en_t.head()

In [None]:
# Move query course 16871 into row 0 (swapping it with the current first row)
# so that the similarity cell can compare matrix[0] with every other row.
# The row position is looked up by id instead of being hard-coded; an
# IndexError here means the id is missing from data_en.
query_pos = int(data_en["id"].eq(16871).values.nonzero()[0][0])
first_row = data_en.iloc[0].copy()
data_en.iloc[0] = data_en.iloc[query_pos].copy()
data_en.iloc[query_pos] = first_row
data_en.head()

In [None]:
%%time
# Fit TF-IDF on the English descriptions; the token lists are first joined into space-separated strings.
# NOTE(review): "desc_words" is overwritten in place, so re-running this cell would join the characters of each string.
# NOTE(review): .toarray() builds a dense matrix — memory grows with rows x vocabulary.
vectorizer_en = TfidfVectorizer()
data_en["desc_words"] = data_en["desc_words"].apply(' '.join)
matrix = vectorizer_en.fit_transform(data_en["desc_words"]).toarray()

In [None]:
# Similarity of the query (row 0) to every remaining row -> shape (1, n_rows - 1).
query_vec = matrix[:1]
cos_m = cosine_similarity(query_vec, matrix[1:])
cos_m.shape

In [None]:
# Turn the similarity row into a single column. -1 lets NumPy infer the
# length, so the cell no longer depends on a hard-coded corpus size.
cos_m = cos_m.reshape(-1, 1)
cos_m

In [None]:
# Wrap the similarity column in a DataFrame (single column labelled 0) so it can be sorted.
cos_df = pd.DataFrame(cos_m)
cos_df.head()

In [None]:
# Row labels of the ten highest similarity scores (offsets into matrix[1:]).
index_list_en_16871 = cos_df.sort_values(by=0, ascending=False).index[:10]
index_list_en_16871

In [None]:
# cos_df row k was computed from matrix[1:], i.e. from row k + 1 of data_en,
# hence the shift by one.
index_list_en_16871 = index_list_en_16871 + 1
# The offsets are row positions, so use .iloc (the old .ix indexer has been
# removed from pandas and resolved integers as labels on an integer index).
index_list_en_16871 = data_en.iloc[index_list_en_16871]["id"].tolist()
index_list_en_16871

In [None]:
# Restore data_en from the snapshot before processing the next query course.
data_en = data_en_t.copy()
data_en.head()

In [None]:
# Move query course 12234 into row 0 (swapping it with the current first row)
# so that the similarity cell can compare matrix[0] with every other row.
# The row position is looked up by id instead of being hard-coded; an
# IndexError here means the id is missing from data_en.
query_pos = int(data_en["id"].eq(12234).values.nonzero()[0][0])
first_row = data_en.iloc[0].copy()
data_en.iloc[0] = data_en.iloc[query_pos].copy()
data_en.iloc[query_pos] = first_row
data_en.head()

In [None]:
%%time
# Fit TF-IDF on the English descriptions; the token lists are first joined into space-separated strings.
# NOTE(review): "desc_words" is overwritten in place, so re-running this cell would join the characters of each string.
# NOTE(review): .toarray() builds a dense matrix — memory grows with rows x vocabulary.
vectorizer_en = TfidfVectorizer()
data_en["desc_words"] = data_en["desc_words"].apply(' '.join)
matrix = vectorizer_en.fit_transform(data_en["desc_words"]).toarray()

In [None]:
# Similarity of the query (row 0) to every remaining row -> shape (1, n_rows - 1).
query_vec = matrix[:1]
cos_m = cosine_similarity(query_vec, matrix[1:])
cos_m.shape

In [None]:
# Turn the similarity row into a single column. -1 lets NumPy infer the
# length, so the cell no longer depends on a hard-coded corpus size.
cos_m = cos_m.reshape(-1, 1)
cos_m

In [None]:
# Wrap the similarity column in a DataFrame (single column labelled 0) so it can be sorted.
cos_df = pd.DataFrame(cos_m)
cos_df.head()

In [None]:
# Row labels of the ten highest similarity scores (offsets into matrix[1:]).
index_list_en_12234 = cos_df.sort_values(by=0, ascending=False).index[:10]
index_list_en_12234

In [None]:
# cos_df row k was computed from matrix[1:], i.e. from row k + 1 of data_en,
# hence the shift by one.
index_list_en_12234 = index_list_en_12234 + 1
# The offsets are row positions, so use .iloc (the old .ix indexer has been
# removed from pandas and resolved integers as labels on an integer index).
index_list_en_12234 = data_en.iloc[index_list_en_12234]["id"].tolist()
index_list_en_12234

Испанский

In [None]:
# Query courses with Spanish descriptions: ids 9029 and 18030.
# NOTE(review): `.index` holds index *labels*; the swap cells further down
# address rows by position (iloc) — confirm labels and positions coincide.
ind1 = data_es[data_es["id"] == 9029]
ind2 = data_es[data_es["id"] == 18030]
print(ind1.index)
print(ind2.index)

In [None]:
# Snapshot of data_es taken before the following cells modify it;
# data_es is restored from this copy before the second Spanish query.
data_es_t = data_es.copy()
data_es_t.head()

In [None]:
# Move query course 9029 into row 0 (swapping it with the current first row)
# so that the similarity cell can compare matrix[0] with every other row.
# The row position is looked up by id instead of being hard-coded; an
# IndexError here means the id is missing from data_es.
query_pos = int(data_es["id"].eq(9029).values.nonzero()[0][0])
first_row = data_es.iloc[0].copy()
data_es.iloc[0] = data_es.iloc[query_pos].copy()
data_es.iloc[query_pos] = first_row
data_es.head()

In [None]:
%%time
# Fit TF-IDF on the Spanish descriptions; the token lists are first joined into space-separated strings.
# NOTE(review): "desc_words" is overwritten in place, so re-running this cell would join the characters of each string.
# NOTE(review): .toarray() builds a dense matrix — memory grows with rows x vocabulary.
vectorizer_es = TfidfVectorizer()
data_es["desc_words"] = data_es["desc_words"].apply(' '.join)
matrix = vectorizer_es.fit_transform(data_es["desc_words"]).toarray()

In [None]:
# Similarity of the query (row 0) to every remaining row -> shape (1, n_rows - 1).
query_vec = matrix[:1]
cos_m = cosine_similarity(query_vec, matrix[1:])
cos_m.shape

In [None]:
# Turn the similarity row into a single column. -1 lets NumPy infer the
# length, so the cell no longer depends on a hard-coded corpus size.
cos_m = cos_m.reshape(-1, 1)
cos_m

In [None]:
# Wrap the similarity column in a DataFrame (single column labelled 0) so it can be sorted.
cos_df = pd.DataFrame(cos_m)
cos_df.head()

In [None]:
# Row labels of the ten highest similarity scores (offsets into matrix[1:]).
index_list_es_9029 = cos_df.sort_values(by=0, ascending=False).index[:10]
index_list_es_9029

In [None]:
# cos_df row k was computed from matrix[1:], i.e. from row k + 1 of data_es,
# hence the shift by one.
index_list_es_9029 = index_list_es_9029 + 1
# The offsets are row positions, so use .iloc (the old .ix indexer has been
# removed from pandas and resolved integers as labels on an integer index).
index_list_es_9029 = data_es.iloc[index_list_es_9029]["id"].tolist()
index_list_es_9029

In [None]:
# Restore data_es from the snapshot before processing the next query course.
data_es = data_es_t.copy()
data_es.head()

In [None]:
# Move query course 18030 into row 0 (swapping it with the current first row)
# so that the similarity cell can compare matrix[0] with every other row.
# The row position is looked up by id instead of being hard-coded; an
# IndexError here means the id is missing from data_es.
query_pos = int(data_es["id"].eq(18030).values.nonzero()[0][0])
first_row = data_es.iloc[0].copy()
data_es.iloc[0] = data_es.iloc[query_pos].copy()
data_es.iloc[query_pos] = first_row
data_es.head()

In [None]:
%%time
# Fit TF-IDF on the Spanish descriptions; the token lists are first joined into space-separated strings.
# NOTE(review): "desc_words" is overwritten in place, so re-running this cell would join the characters of each string.
# NOTE(review): .toarray() builds a dense matrix — memory grows with rows x vocabulary.
vectorizer_es = TfidfVectorizer()
data_es["desc_words"] = data_es["desc_words"].apply(' '.join)
matrix = vectorizer_es.fit_transform(data_es["desc_words"]).toarray()

In [None]:
# Similarity of the query (row 0) to every remaining row -> shape (1, n_rows - 1).
query_vec = matrix[:1]
cos_m = cosine_similarity(query_vec, matrix[1:])
cos_m.shape

In [None]:
# Turn the similarity row into a single column. -1 lets NumPy infer the
# length, so the cell no longer depends on a hard-coded corpus size.
cos_m = cos_m.reshape(-1, 1)
cos_m

In [None]:
# Wrap the similarity column in a DataFrame (single column labelled 0) so it can be sorted.
cos_df = pd.DataFrame(cos_m)
cos_df.head()

In [None]:
# Row labels of the ten highest similarity scores (offsets into matrix[1:]).
index_list_es_18030 = cos_df.sort_values(by=0, ascending=False).index[:10]
index_list_es_18030

In [None]:
# cos_df row k was computed from matrix[1:], i.e. from row k + 1 of data_es,
# hence the shift by one.
index_list_es_18030 = index_list_es_18030 + 1
# The offsets are row positions, so use .iloc (the old .ix indexer has been
# removed from pandas and resolved integers as labels on an integer index).
index_list_es_18030 = data_es.iloc[index_list_es_18030]["id"].tolist()
index_list_es_18030

ВЫГРУЗКА

In [None]:
# Final result: query course id (string key) -> ids of its ten most similar courses.
output = dict([
    ("16871", index_list_en_16871),
    ("12234", index_list_en_12234),
    ("9029", index_list_es_9029),
    ("18030", index_list_es_18030),
    ("8170", index_list_ru_8170),
    ("1200", index_list_ru_1200),
])
output

In [None]:
# JSON text of the result, displayed for inspection; the file-writing cells serialise `output` directly.
json_data = json.dumps(output)
json_data

In [None]:
# Write the result next to the notebook's parent directory as lab07.json.
with open('../lab07.json', 'w') as f:
    json.dump(output, f)

In [None]:
# Write the same payload a second time under the name lab07s.json.
# NOTE(review): identical content to ../lab07.json — confirm both files are required.
with open('../lab07s.json', 'w') as f:
    json.dump(output, f)