In [1]:
import re
import json
import nltk
import warnings
import pymorphy2
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

warnings.filterwarnings("ignore")
stops_ru = set(stopwords.words('russian'))
stops_en = set(stopwords.words('english'))
stops_es = set(stopwords.words('spanish'))

#Лемматизация на английском
from nltk.stem.wordnet import WordNetLemmatizer

#Стэммер для испанского языка
from nltk.stem.snowball import SnowballStemmer

In [2]:
path = '../DO_record_per_line.json'

In [3]:
data = pd.read_json(path, lines=True)
data.head()

Unnamed: 0,cat,desc,id,lang,name,provider
0,3/business_management|6/economics_finance,This course introduces the basic financial sta...,4,en,Accounting Cycle: The Foundation of Business M...,Canvas Network
1,11/law,This online course will introduce you to Ameri...,5,en,American Counter Terrorism Law,Canvas Network
2,5/computer_science|15/mathematics_statistics_a...,This course is taught in French Vous voulez co...,6,fr,Arithmétique: en route pour la cryptographie,Canvas Network
3,14/social_sciences,We live in a digitally connected world. The wa...,7,en,Becoming a Dynamic Educator,Canvas Network
4,2/biology_life_sciences,This self-paced course is designed to show tha...,8,en,Bioethics,Canvas Network


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28153 entries, 0 to 28152
Data columns (total 6 columns):
cat         28153 non-null object
desc        28153 non-null object
id          28153 non-null int64
lang        28153 non-null object
name        28153 non-null object
provider    28153 non-null object
dtypes: int64(1), object(5)
memory usage: 1.3+ MB


In [5]:
pred_course = [16871,12234,9029,18030,8170,1200]

In [6]:
regex = re.compile(r'[\w\d]{2,}', re.U)

In [7]:
data["desc_words"] = data["desc"].apply(lambda x: regex.findall(x.lower()))
data.head()

Unnamed: 0,cat,desc,id,lang,name,provider,desc_words
0,3/business_management|6/economics_finance,This course introduces the basic financial sta...,4,en,Accounting Cycle: The Foundation of Business M...,Canvas Network,"[this, course, introduces, the, basic, financi..."
1,11/law,This online course will introduce you to Ameri...,5,en,American Counter Terrorism Law,Canvas Network,"[this, online, course, will, introduce, you, t..."
2,5/computer_science|15/mathematics_statistics_a...,This course is taught in French Vous voulez co...,6,fr,Arithmétique: en route pour la cryptographie,Canvas Network,"[this, course, is, taught, in, french, vous, v..."
3,14/social_sciences,We live in a digitally connected world. The wa...,7,en,Becoming a Dynamic Educator,Canvas Network,"[we, live, in, digitally, connected, world, th..."
4,2/biology_life_sciences,This self-paced course is designed to show tha...,8,en,Bioethics,Canvas Network,"[this, self, paced, course, is, designed, to, ..."


In [8]:
#Уникальные языки курсов
data["lang"].value_counts()

en    24553
es     1374
ru     1231
pt      187
zh      169
de      166
tr      120
fr      104
ja       77
it       62
ar       34
ur       11
he        8
nl        6
ca        6
hi        6
sk        5
el        5
hr        4
ko        3
da        3
fi        2
hu        2
nb        2
af        2
bg        2
uz        1
pl        1
sv        1
et        1
sw        1
vi        1
ms        1
fa        1
id        1
Name: lang, dtype: int64

In [9]:
# Курсы для рекомендаций
print(data[data["id"].isin(pred_course)]["lang"].value_counts())
data[data["id"].isin(pred_course)]

ru    2
en    2
es    2
Name: lang, dtype: int64


Unnamed: 0,cat,desc,id,lang,name,provider,desc_words
1191,5/computer_science,В курсе представлены понятия и положения теори...,1200,ru,Современные операционные системы,Intuit,"[курсе, представлены, понятия, положения, теор..."
8148,5/computer_science,В курсе представлены новые результаты авторов ...,8170,ru,Разработка приложений для мобильных интеллекту...,Intuit,"[курсе, представлены, новые, результаты, автор..."
8999,3/business_management,Curso de Bolsa para principiantes Un curso to...,9029,es,Aprende a Ganar Dinero en la Bolsa by Diego Qu...,Udemy,"[curso, de, bolsa, para, principiantes, un, cu..."
12204,15/mathematics_statistics_and_data_analysis,This graduate-level course is a computationall...,12234,en,Elliptic Curves,MIT OpenCourseWare,"[this, graduate, level, course, is, computatio..."
16826,3/business_management,A course covering International Accounting St...,16871,en,"IAS37 - Provisions, Contingents Liabilities & ...",Udemy,"[course, covering, international, accounting, ..."
17977,,"Prácticas para convertirse en un gran orador,...",18030,es,Ejercicios Preparatorios Para Hablar en Públic...,Udemy,"[prácticas, para, convertirse, en, un, gran, o..."


In [10]:
data_ru = data[data["lang"] == "ru"].reset_index()
print("RU - ", data_ru.shape)

data_en = data[data["lang"] == "en"].reset_index()
print("EN - ", data_en.shape)

data_es = data[data["lang"] == "es"].reset_index()
print("ES - ", data_es.shape)

RU -  (1231, 8)
EN -  (24553, 8)
ES -  (1374, 8)


In [11]:
morph = pymorphy2.MorphAnalyzer()
data_ru["desc_words"] = data_ru["desc_words"].apply(lambda x: [morph.parse(tok)[0].normal_form for tok in x])
data_ru.head()

Unnamed: 0,index,cat,desc,id,lang,name,provider,desc_words
0,42,5/computer_science,Часть 1. Продвинутые структуры данных\r\nПриор...,46,ru,Дополнительные главы алгоритмов,Computer Science Center,"[часть, продвинуть, структура, дать, приоритет..."
1,43,5/computer_science,Splay-дерево и декартово дерево\r\nХеширование...,47,ru,Алгоритмы и структуры данных 2,Computer Science Center,"[splay, дерево, декартов, дерево, хеширование,..."
2,44,5/computer_science,Курс посвящён теоретическим и практическим асп...,48,ru,Технологии хранения и обработки больших объёмо...,Computer Science Center,"[курс, посвятить, теоретический, практический,..."
3,45,2/biology_life_sciences|5/computer_science,Биоинформатика — это быстро растущий раздел co...,49,ru,Алгоритмы в биоинформатике,Computer Science Center,"[биоинформатика, это, быстро, расти, раздел, c..."
4,46,5/computer_science|15/mathematics_statistics_a...,Курс знакомит со сложностью вероятностных вычи...,50,ru,Сложность вычислений и основы криптографии,Computer Science Center,"[курс, знакомить, с, сложность, вероятностный,..."


In [12]:
lmtzr = WordNetLemmatizer()
data_en["desc_words"] = data_en["desc_words"].apply(lambda x: [lmtzr.lemmatize(tok) for tok in x])
data_en.head()

Unnamed: 0,index,cat,desc,id,lang,name,provider,desc_words
0,0,3/business_management|6/economics_finance,This course introduces the basic financial sta...,4,en,Accounting Cycle: The Foundation of Business M...,Canvas Network,"[this, course, introduces, the, basic, financi..."
1,1,11/law,This online course will introduce you to Ameri...,5,en,American Counter Terrorism Law,Canvas Network,"[this, online, course, will, introduce, you, t..."
2,3,14/social_sciences,We live in a digitally connected world. The wa...,7,en,Becoming a Dynamic Educator,Canvas Network,"[we, live, in, digitally, connected, world, th..."
3,4,2/biology_life_sciences,This self-paced course is designed to show tha...,8,en,Bioethics,Canvas Network,"[this, self, paced, course, is, designed, to, ..."
4,5,9/humanities|15/mathematics_statistics_and_dat...,This game-based course provides prospective st...,9,en,"College Foundations: Reading, Writing, and Math",Canvas Network,"[this, game, based, course, provides, prospect..."


In [13]:
spanishStemmer=SnowballStemmer("spanish", ignore_stopwords=True)
data_es["desc_words"] = data_es["desc_words"].apply(lambda x: [spanishStemmer.stem(tok) for tok in x])
data_es.head()

Unnamed: 0,index,cat,desc,id,lang,name,provider,desc_words
0,55,,A través de diferentes actividades de campo el...,59,es,El ABC del emprendimiento esbelto,Coursera,"[traves, de, diferent, activ, de, camp, el, pa..."
1,120,2/biology_life_sciences|9/humanities,Aprenderemos cómo podemos usar el pensamiento ...,124,es,Pensamiento Científico,Coursera,"[aprend, com, pod, usar, el, pensamient, cient..."
2,156,8/engineering_technology|9/humanities|14/socia...,¡Claro que todos podemos potenciar nuestra cre...,160,es,Ser más creativos,Coursera,"[clar, que, todos, pod, potenci, nuestra, crea..."
3,162,7/energy_earth_sciences|9/humanities|13/physic...,Este curso provee al estudiante con conceptos ...,166,es,Conceptos y Herramientas para la Física Univer...,Coursera,"[este, curs, prove, al, estudi, con, concept, ..."
4,192,9/humanities,Este curso introduce a los estudiantes de grad...,196,es,Egiptología (Egyptology),Coursera,"[este, curs, introduc, los, estudi, de, grad, ..."


Удаляем стоп-слова

In [14]:
def remove_stopwords(tokens, stopwords=None, min_length=4):
    if not stopwords:
        return tokens
    stopwords = set(stopwords)
    tokens = [tok
              for tok in tokens
              if tok not in stopwords and len(tok) >= min_length]
    return tokens

data_ru["desc_words"] = data_ru["desc_words"].apply(lambda x: remove_stopwords(x,stops_ru))
data_en["desc_words"] = data_en["desc_words"].apply(lambda x: remove_stopwords(x,stops_en))
data_es["desc_words"] = data_es["desc_words"].apply(lambda x: remove_stopwords(x,stops_es))
data_ru.head()

Unnamed: 0,index,cat,desc,id,lang,name,provider,desc_words
0,42,5/computer_science,Часть 1. Продвинутые структуры данных\r\nПриор...,46,ru,Дополнительные главы алгоритмов,Computer Science Center,"[часть, продвинуть, структура, дать, приоритет..."
1,43,5/computer_science,Splay-дерево и декартово дерево\r\nХеширование...,47,ru,Алгоритмы и структуры данных 2,Computer Science Center,"[splay, дерево, декартов, дерево, хеширование,..."
2,44,5/computer_science,Курс посвящён теоретическим и практическим асп...,48,ru,Технологии хранения и обработки больших объёмо...,Computer Science Center,"[курс, посвятить, теоретический, практический,..."
3,45,2/biology_life_sciences|5/computer_science,Биоинформатика — это быстро растущий раздел co...,49,ru,Алгоритмы в биоинформатике,Computer Science Center,"[биоинформатика, быстро, расти, раздел, comput..."
4,46,5/computer_science|15/mathematics_statistics_a...,Курс знакомит со сложностью вероятностных вычи...,50,ru,Сложность вычислений и основы криптографии,Computer Science Center,"[курс, знакомить, сложность, вероятностный, вы..."


In [15]:
data_ru = data_ru.sort_values("id")
data_en = data_en.sort_values("id")
data_es = data_es.sort_values("id")
data_ru.head()

Unnamed: 0,index,cat,desc,id,lang,name,provider,desc_words
0,42,5/computer_science,Часть 1. Продвинутые структуры данных\r\nПриор...,46,ru,Дополнительные главы алгоритмов,Computer Science Center,"[часть, продвинуть, структура, дать, приоритет..."
1,43,5/computer_science,Splay-дерево и декартово дерево\r\nХеширование...,47,ru,Алгоритмы и структуры данных 2,Computer Science Center,"[splay, дерево, декартов, дерево, хеширование,..."
2,44,5/computer_science,Курс посвящён теоретическим и практическим асп...,48,ru,Технологии хранения и обработки больших объёмо...,Computer Science Center,"[курс, посвятить, теоретический, практический,..."
3,45,2/biology_life_sciences|5/computer_science,Биоинформатика — это быстро растущий раздел co...,49,ru,Алгоритмы в биоинформатике,Computer Science Center,"[биоинформатика, быстро, расти, раздел, comput..."
4,46,5/computer_science|15/mathematics_statistics_a...,Курс знакомит со сложностью вероятностных вычи...,50,ru,Сложность вычислений и основы криптографии,Computer Science Center,"[курс, знакомить, сложность, вероятностный, вы..."


#### Считаем косинусную меру и TF-IDF

Русские тексты

In [16]:
#8170
#1200

In [17]:
ind1 = data_ru[data_ru["id"] == 1200]
ind2 = data_ru[data_ru["id"] == 8170]
print(ind1.index)
print(ind2.index)

Int64Index([444], dtype='int64')
Int64Index([739], dtype='int64')


In [18]:
data_ru_t = data_ru.copy()
data_ru_t.head()

Unnamed: 0,index,cat,desc,id,lang,name,provider,desc_words
0,42,5/computer_science,Часть 1. Продвинутые структуры данных\r\nПриор...,46,ru,Дополнительные главы алгоритмов,Computer Science Center,"[часть, продвинуть, структура, дать, приоритет..."
1,43,5/computer_science,Splay-дерево и декартово дерево\r\nХеширование...,47,ru,Алгоритмы и структуры данных 2,Computer Science Center,"[splay, дерево, декартов, дерево, хеширование,..."
2,44,5/computer_science,Курс посвящён теоретическим и практическим асп...,48,ru,Технологии хранения и обработки больших объёмо...,Computer Science Center,"[курс, посвятить, теоретический, практический,..."
3,45,2/biology_life_sciences|5/computer_science,Биоинформатика — это быстро растущий раздел co...,49,ru,Алгоритмы в биоинформатике,Computer Science Center,"[биоинформатика, быстро, расти, раздел, comput..."
4,46,5/computer_science|15/mathematics_statistics_a...,Курс знакомит со сложностью вероятностных вычи...,50,ru,Сложность вычислений и основы криптографии,Computer Science Center,"[курс, знакомить, сложность, вероятностный, вы..."


In [19]:
b, c = data_ru.iloc[0], data_ru.iloc[444]
temp = data_ru.iloc[0].copy()
data_ru.iloc[0] = c
data_ru.iloc[444] = temp
data_ru.head()

Unnamed: 0,index,cat,desc,id,lang,name,provider,desc_words
0,1191,5/computer_science,В курсе представлены понятия и положения теори...,1200,ru,Современные операционные системы,Intuit,"[курс, представить, понятие, положение, теория..."
1,43,5/computer_science,Splay-дерево и декартово дерево\r\nХеширование...,47,ru,Алгоритмы и структуры данных 2,Computer Science Center,"[splay, дерево, декартов, дерево, хеширование,..."
2,44,5/computer_science,Курс посвящён теоретическим и практическим асп...,48,ru,Технологии хранения и обработки больших объёмо...,Computer Science Center,"[курс, посвятить, теоретический, практический,..."
3,45,2/biology_life_sciences|5/computer_science,Биоинформатика — это быстро растущий раздел co...,49,ru,Алгоритмы в биоинформатике,Computer Science Center,"[биоинформатика, быстро, расти, раздел, comput..."
4,46,5/computer_science|15/mathematics_statistics_a...,Курс знакомит со сложностью вероятностных вычи...,50,ru,Сложность вычислений и основы криптографии,Computer Science Center,"[курс, знакомить, сложность, вероятностный, вы..."


In [20]:
%%time
vectorizer_rus = TfidfVectorizer()
data_ru["desc_words"] = data_ru["desc_words"].apply(' '.join)
matrix = vectorizer_rus.fit_transform(data_ru["desc_words"]).toarray()

CPU times: user 1.2 s, sys: 244 ms, total: 1.44 s
Wall time: 397 ms


In [21]:
cos_m = cosine_similarity(matrix[0].reshape(1,-1), matrix[1:])
cos_m.shape

(1, 1230)

In [22]:
cos_m = cos_m.reshape(1230,1)
cos_m

array([[0.01240741],
       [0.04378895],
       [0.03124609],
       ...,
       [0.00411149],
       [0.00184058],
       [0.04110302]])

In [23]:
cos_df = pd.DataFrame(cos_m)
cos_df.head()

Unnamed: 0,0
0,0.012407
1,0.043789
2,0.031246
3,0.072302
4,0.067


In [24]:
index_list_ru_1200 = cos_df.sort_values(0,ascending=False).head(10).index
index_list_ru_1200

Int64Index([447, 195, 10, 727, 451, 442, 449, 1006, 448, 677], dtype='int64')

In [25]:
index_list_ru_1200 = index_list_ru_1200 + 1
index_list_ru_1200 = data_ru.ix[index_list_ru_1200]["id"].tolist()
index_list_ru_1200

[1204, 952, 767, 8159, 1208, 1199, 1206, 20306, 1205, 5216]

In [26]:
data_ru = data_ru_t.copy()
data_ru.head()

Unnamed: 0,index,cat,desc,id,lang,name,provider,desc_words
0,42,5/computer_science,Часть 1. Продвинутые структуры данных\r\nПриор...,46,ru,Дополнительные главы алгоритмов,Computer Science Center,"[часть, продвинуть, структура, дать, приоритет..."
1,43,5/computer_science,Splay-дерево и декартово дерево\r\nХеширование...,47,ru,Алгоритмы и структуры данных 2,Computer Science Center,"[splay, дерево, декартов, дерево, хеширование,..."
2,44,5/computer_science,Курс посвящён теоретическим и практическим асп...,48,ru,Технологии хранения и обработки больших объёмо...,Computer Science Center,"[курс, посвятить, теоретический, практический,..."
3,45,2/biology_life_sciences|5/computer_science,Биоинформатика — это быстро растущий раздел co...,49,ru,Алгоритмы в биоинформатике,Computer Science Center,"[биоинформатика, быстро, расти, раздел, comput..."
4,46,5/computer_science|15/mathematics_statistics_a...,Курс знакомит со сложностью вероятностных вычи...,50,ru,Сложность вычислений и основы криптографии,Computer Science Center,"[курс, знакомить, сложность, вероятностный, вы..."


In [27]:
b, c = data_ru.iloc[0], data_ru.iloc[739]
temp = data_ru.iloc[0].copy()
data_ru.iloc[0] = c
data_ru.iloc[739] = temp
data_ru.head()

Unnamed: 0,index,cat,desc,id,lang,name,provider,desc_words
0,8148,5/computer_science,В курсе представлены новые результаты авторов ...,8170,ru,Разработка приложений для мобильных интеллекту...,Intuit,"[курс, представить, новое, результат, автор, р..."
1,43,5/computer_science,Splay-дерево и декартово дерево\r\nХеширование...,47,ru,Алгоритмы и структуры данных 2,Computer Science Center,"[splay, дерево, декартов, дерево, хеширование,..."
2,44,5/computer_science,Курс посвящён теоретическим и практическим асп...,48,ru,Технологии хранения и обработки больших объёмо...,Computer Science Center,"[курс, посвятить, теоретический, практический,..."
3,45,2/biology_life_sciences|5/computer_science,Биоинформатика — это быстро растущий раздел co...,49,ru,Алгоритмы в биоинформатике,Computer Science Center,"[биоинформатика, быстро, расти, раздел, comput..."
4,46,5/computer_science|15/mathematics_statistics_a...,Курс знакомит со сложностью вероятностных вычи...,50,ru,Сложность вычислений и основы криптографии,Computer Science Center,"[курс, знакомить, сложность, вероятностный, вы..."


In [28]:
%%time
vectorizer_rus = TfidfVectorizer()
data_ru["desc_words"] = data_ru["desc_words"].apply(' '.join)
matrix = vectorizer_rus.fit_transform(data_ru["desc_words"]).toarray()

CPU times: user 2.09 s, sys: 436 ms, total: 2.53 s
Wall time: 340 ms


In [29]:
cos_m = cosine_similarity(matrix[0].reshape(1,-1), matrix[1:])
cos_m.shape

(1, 1230)

In [30]:
cos_m = cos_m.reshape(1230,1)
cos_m

array([[0.00749492],
       [0.03450324],
       [0.01879766],
       ...,
       [0.0047467 ],
       [0.01990907],
       [0.07078151]])

In [31]:
cos_df = pd.DataFrame(cos_m)
cos_df.head()

Unnamed: 0,0
0,0.007495
1,0.034503
2,0.018798
3,0.010146
4,0.02539


In [32]:
index_list_ru_8170 = cos_df.sort_values(0,ascending=False).head(10).index
index_list_ru_8170

Int64Index([737, 1052, 553, 110, 991, 933, 983, 36, 554, 1062], dtype='int64')

In [33]:
index_list_ru_8170 = index_list_ru_8170 + 1
index_list_ru_8170 = data_ru.ix[index_list_ru_8170]["id"].tolist()
index_list_ru_8170

[8169, 20352, 1310, 867, 20291, 17127, 20103, 793, 1311, 20362]

Английский

In [34]:
#12234
#16871
ind1 = data_en[data_en["id"] == 12234]
ind2 = data_en[data_en["id"] == 16871]
print(ind1.index)
print(ind2.index)

Int64Index([10332], dtype='int64')
Int64Index([14517], dtype='int64')


In [35]:
data_en_t = data_en.copy()
data_en_t.head()

Unnamed: 0,index,cat,desc,id,lang,name,provider,desc_words
0,0,3/business_management|6/economics_finance,This course introduces the basic financial sta...,4,en,Accounting Cycle: The Foundation of Business M...,Canvas Network,"[course, introduces, basic, financial, stateme..."
1,1,11/law,This online course will introduce you to Ameri...,5,en,American Counter Terrorism Law,Canvas Network,"[online, course, introduce, american, related,..."
2,3,14/social_sciences,We live in a digitally connected world. The wa...,7,en,Becoming a Dynamic Educator,Canvas Network,"[live, digitally, connected, world, informatio..."
3,4,2/biology_life_sciences,This self-paced course is designed to show tha...,8,en,Bioethics,Canvas Network,"[self, paced, course, designed, show, ethical,..."
4,5,9/humanities|15/mathematics_statistics_and_dat...,This game-based course provides prospective st...,9,en,"College Foundations: Reading, Writing, and Math",Canvas Network,"[game, based, course, provides, prospective, s..."


In [36]:
b, c = data_en.iloc[0], data_en.iloc[14517]
temp = data_en.iloc[0].copy()
data_en.iloc[0] = c
data_en.iloc[14517] = temp
data_en.head()

Unnamed: 0,index,cat,desc,id,lang,name,provider,desc_words
0,16826,3/business_management,A course covering International Accounting St...,16871,en,"IAS37 - Provisions, Contingents Liabilities & ...",Udemy,"[course, covering, international, accounting, ..."
1,1,11/law,This online course will introduce you to Ameri...,5,en,American Counter Terrorism Law,Canvas Network,"[online, course, introduce, american, related,..."
2,3,14/social_sciences,We live in a digitally connected world. The wa...,7,en,Becoming a Dynamic Educator,Canvas Network,"[live, digitally, connected, world, informatio..."
3,4,2/biology_life_sciences,This self-paced course is designed to show tha...,8,en,Bioethics,Canvas Network,"[self, paced, course, designed, show, ethical,..."
4,5,9/humanities|15/mathematics_statistics_and_dat...,This game-based course provides prospective st...,9,en,"College Foundations: Reading, Writing, and Math",Canvas Network,"[game, based, course, provides, prospective, s..."


In [37]:
%%time
vectorizer_en = TfidfVectorizer()
data_en["desc_words"] = data_en["desc_words"].apply(' '.join)
matrix = vectorizer_en.fit_transform(data_en["desc_words"]).toarray()

CPU times: user 17.1 s, sys: 12.3 s, total: 29.5 s
Wall time: 19.5 s


In [38]:
cos_m = cosine_similarity(matrix[0].reshape(1,-1), matrix[1:])
cos_m.shape

(1, 24552)

In [39]:
cos_m = cos_m.reshape(24552,1)
cos_m

array([[0.01166859],
       [0.0050036 ],
       [0.03885044],
       ...,
       [0.03536687],
       [0.02024979],
       [0.02251061]])

In [40]:
cos_df = pd.DataFrame(cos_m)
cos_df.head()

Unnamed: 0,0
0,0.011669
1,0.005004
2,0.03885
3,0.030126
4,0.005835


In [41]:
index_list_en_16871 = cos_df.sort_values(0,ascending=False).head(10).index
index_list_en_16871

Int64Index([17323, 10406, 17010, 10877, 10995, 17324, 17011, 10993, 17567,
            22728],
           dtype='int64')

In [42]:
index_list_en_16871 = index_list_en_16871 + 1
index_list_en_16871 = data_en.ix[index_list_en_16871]["id"].tolist()
index_list_en_16871

[20182, 12363, 19809, 12952, 13127, 20183, 19810, 13125, 20534, 26272]

In [43]:
data_en = data_en_t.copy()
data_en.head()

Unnamed: 0,index,cat,desc,id,lang,name,provider,desc_words
0,0,3/business_management|6/economics_finance,This course introduces the basic financial sta...,4,en,Accounting Cycle: The Foundation of Business M...,Canvas Network,"[course, introduces, basic, financial, stateme..."
1,1,11/law,This online course will introduce you to Ameri...,5,en,American Counter Terrorism Law,Canvas Network,"[online, course, introduce, american, related,..."
2,3,14/social_sciences,We live in a digitally connected world. The wa...,7,en,Becoming a Dynamic Educator,Canvas Network,"[live, digitally, connected, world, informatio..."
3,4,2/biology_life_sciences,This self-paced course is designed to show tha...,8,en,Bioethics,Canvas Network,"[self, paced, course, designed, show, ethical,..."
4,5,9/humanities|15/mathematics_statistics_and_dat...,This game-based course provides prospective st...,9,en,"College Foundations: Reading, Writing, and Math",Canvas Network,"[game, based, course, provides, prospective, s..."


In [44]:
b, c = data_en.iloc[0], data_en.iloc[10332]
temp = data_en.iloc[0].copy()
data_en.iloc[0] = c
data_en.iloc[10332] = temp
data_en.head()

Unnamed: 0,index,cat,desc,id,lang,name,provider,desc_words
0,12204,15/mathematics_statistics_and_data_analysis,This graduate-level course is a computationall...,12234,en,Elliptic Curves,MIT OpenCourseWare,"[graduate, level, course, computationally, foc..."
1,1,11/law,This online course will introduce you to Ameri...,5,en,American Counter Terrorism Law,Canvas Network,"[online, course, introduce, american, related,..."
2,3,14/social_sciences,We live in a digitally connected world. The wa...,7,en,Becoming a Dynamic Educator,Canvas Network,"[live, digitally, connected, world, informatio..."
3,4,2/biology_life_sciences,This self-paced course is designed to show tha...,8,en,Bioethics,Canvas Network,"[self, paced, course, designed, show, ethical,..."
4,5,9/humanities|15/mathematics_statistics_and_dat...,This game-based course provides prospective st...,9,en,"College Foundations: Reading, Writing, and Math",Canvas Network,"[game, based, course, provides, prospective, s..."


In [45]:
%%time
vectorizer_en = TfidfVectorizer()
data_en["desc_words"] = data_en["desc_words"].apply(' '.join)
matrix = vectorizer_en.fit_transform(data_en["desc_words"]).toarray()

CPU times: user 20.8 s, sys: 6.43 s, total: 27.2 s
Wall time: 17.4 s


In [46]:
cos_m = cosine_similarity(matrix[0].reshape(1,-1), matrix[1:])
cos_m.shape

(1, 24552)

In [47]:
cos_m = cos_m.reshape(24552,1)
cos_m

array([[0.00450845],
       [0.00670627],
       [0.01695588],
       ...,
       [0.00272181],
       [0.00660777],
       [0.01832073]])

In [48]:
cos_df = pd.DataFrame(cos_m)
cos_df.head()

Unnamed: 0,0
0,0.004508
1,0.006706
2,0.016956
3,0.026485
4,0.004958


In [49]:
index_list_en_12234 = cos_df.sort_values(0,ascending=False).head(10).index
index_list_en_12234

Int64Index([19948, 1443, 1441, 1440, 6971, 1044, 10417, 9555, 137, 1439], dtype='int64')

In [50]:
index_list_en_12234 = index_list_en_12234 + 1
index_list_en_12234 = data_en.ix[index_list_en_12234]["id"].tolist()
index_list_en_12234

[23256, 2164, 2162, 2161, 8101, 1765, 12384, 11273, 164, 2160]

#### Испанский

In [51]:
#9029
#18030
ind1 = data_es[data_es["id"] == 9029]
ind2 = data_es[data_es["id"] == 18030]
print(ind1.index)
print(ind2.index)

Int64Index([256], dtype='int64')
Int64Index([978], dtype='int64')


In [52]:
data_es_t = data_es.copy()
data_es_t.head()

Unnamed: 0,index,cat,desc,id,lang,name,provider,desc_words
0,55,,A través de diferentes actividades de campo el...,59,es,El ABC del emprendimiento esbelto,Coursera,"[traves, diferent, activ, camp, particip, pers..."
1,120,2/biology_life_sciences|9/humanities,Aprenderemos cómo podemos usar el pensamiento ...,124,es,Pensamiento Científico,Coursera,"[aprend, usar, pensamient, cientif, cotidian, ..."
2,156,8/engineering_technology|9/humanities|14/socia...,¡Claro que todos podemos potenciar nuestra cre...,160,es,Ser más creativos,Coursera,"[clar, potenci, creativ, traves, proced, rutin..."
3,162,7/energy_earth_sciences|9/humanities|13/physic...,Este curso provee al estudiante con conceptos ...,166,es,Conceptos y Herramientas para la Física Univer...,Coursera,"[curs, prove, estudi, concept, herramient, mat..."
4,192,9/humanities,Este curso introduce a los estudiantes de grad...,196,es,Egiptología (Egyptology),Coursera,"[curs, introduc, estudi, grad, public, habl, h..."


In [53]:
b, c = data_es.iloc[0], data_es.iloc[256]
temp = data_es.iloc[0].copy()
data_es.iloc[0] = c
data_es.iloc[256] = temp
data_es.head()

Unnamed: 0,index,cat,desc,id,lang,name,provider,desc_words
0,8999,3/business_management,Curso de Bolsa para principiantes Un curso to...,9029,es,Aprende a Ganar Dinero en la Bolsa by Diego Qu...,Udemy,"[curs, bols, principi, curs, total, diseñ, pri..."
1,120,2/biology_life_sciences|9/humanities,Aprenderemos cómo podemos usar el pensamiento ...,124,es,Pensamiento Científico,Coursera,"[aprend, usar, pensamient, cientif, cotidian, ..."
2,156,8/engineering_technology|9/humanities|14/socia...,¡Claro que todos podemos potenciar nuestra cre...,160,es,Ser más creativos,Coursera,"[clar, potenci, creativ, traves, proced, rutin..."
3,162,7/energy_earth_sciences|9/humanities|13/physic...,Este curso provee al estudiante con conceptos ...,166,es,Conceptos y Herramientas para la Física Univer...,Coursera,"[curs, prove, estudi, concept, herramient, mat..."
4,192,9/humanities,Este curso introduce a los estudiantes de grad...,196,es,Egiptología (Egyptology),Coursera,"[curs, introduc, estudi, grad, public, habl, h..."


In [54]:
%%time
vectorizer_es = TfidfVectorizer()
data_es["desc_words"] = data_es["desc_words"].apply(' '.join)
matrix = vectorizer_es.fit_transform(data_es["desc_words"]).toarray()

CPU times: user 7.3 s, sys: 0 ns, total: 7.3 s
Wall time: 1.68 s


In [55]:
cos_m = cosine_similarity(matrix[0].reshape(1,-1), matrix[1:])
cos_m.shape

(1, 1373)

In [56]:
cos_m = cos_m.reshape(1373,1)
cos_m

array([[0.02492811],
       [0.02629384],
       [0.03164548],
       ...,
       [0.01154038],
       [0.01378132],
       [0.0143144 ]])

In [57]:
cos_df = pd.DataFrame(cos_m)
cos_df.head()

Unnamed: 0,0
0,0.024928
1,0.026294
2,0.031645
3,0.019513
4,0.033724


In [58]:
index_list_es_9029 = cos_df.sort_values(0,ascending=False).head(10).index
index_list_es_9029

Int64Index([102, 324, 1050, 987, 299, 1223, 55, 1195, 1078, 21], dtype='int64')

In [59]:
index_list_es_9029 = index_list_es_9029 + 1
index_list_es_9029 = data_es.ix[index_list_es_9029]["id"].tolist()
index_list_es_9029

[5557, 9401, 20239, 18658, 9293, 23620, 4255, 23302, 21228, 3879]

In [60]:
data_es = data_es_t.copy()
data_es.head()

Unnamed: 0,index,cat,desc,id,lang,name,provider,desc_words
0,55,,A través de diferentes actividades de campo el...,59,es,El ABC del emprendimiento esbelto,Coursera,"[traves, diferent, activ, camp, particip, pers..."
1,120,2/biology_life_sciences|9/humanities,Aprenderemos cómo podemos usar el pensamiento ...,124,es,Pensamiento Científico,Coursera,"[aprend, usar, pensamient, cientif, cotidian, ..."
2,156,8/engineering_technology|9/humanities|14/socia...,¡Claro que todos podemos potenciar nuestra cre...,160,es,Ser más creativos,Coursera,"[clar, potenci, creativ, traves, proced, rutin..."
3,162,7/energy_earth_sciences|9/humanities|13/physic...,Este curso provee al estudiante con conceptos ...,166,es,Conceptos y Herramientas para la Física Univer...,Coursera,"[curs, prove, estudi, concept, herramient, mat..."
4,192,9/humanities,Este curso introduce a los estudiantes de grad...,196,es,Egiptología (Egyptology),Coursera,"[curs, introduc, estudi, grad, public, habl, h..."


In [61]:
b, c = data_es.iloc[0], data_es.iloc[978]
temp = data_es.iloc[0].copy()
data_es.iloc[0] = c
data_es.iloc[978] = temp
data_es.head()

Unnamed: 0,index,cat,desc,id,lang,name,provider,desc_words
0,17977,,"Prácticas para convertirse en un gran orador,...",18030,es,Ejercicios Preparatorios Para Hablar en Públic...,Udemy,"[practic, convert, gran, orador, expositor, pr..."
1,120,2/biology_life_sciences|9/humanities,Aprenderemos cómo podemos usar el pensamiento ...,124,es,Pensamiento Científico,Coursera,"[aprend, usar, pensamient, cientif, cotidian, ..."
2,156,8/engineering_technology|9/humanities|14/socia...,¡Claro que todos podemos potenciar nuestra cre...,160,es,Ser más creativos,Coursera,"[clar, potenci, creativ, traves, proced, rutin..."
3,162,7/energy_earth_sciences|9/humanities|13/physic...,Este curso provee al estudiante con conceptos ...,166,es,Conceptos y Herramientas para la Física Univer...,Coursera,"[curs, prove, estudi, concept, herramient, mat..."
4,192,9/humanities,Este curso introduce a los estudiantes de grad...,196,es,Egiptología (Egyptology),Coursera,"[curs, introduc, estudi, grad, public, habl, h..."


In [62]:
%%time
vectorizer_es = TfidfVectorizer()
data_es["desc_words"] = data_es["desc_words"].apply(' '.join)
matrix = vectorizer_es.fit_transform(data_es["desc_words"]).toarray()

CPU times: user 7.04 s, sys: 0 ns, total: 7.04 s
Wall time: 920 ms


In [63]:
cos_m = cosine_similarity(matrix[0].reshape(1,-1), matrix[1:])
cos_m.shape

(1, 1373)

In [64]:
cos_m = cos_m.reshape(1373,1)
cos_m

array([[0.04559749],
       [0.08821277],
       [0.03039114],
       ...,
       [0.04180021],
       [0.03240683],
       [0.04131665]])

In [65]:
cos_df = pd.DataFrame(cos_m)
cos_df.head()

Unnamed: 0,0
0,0.045597
1,0.088213
2,0.030391
3,0.047591
4,0.055592


In [66]:
index_list_es_18030 = cos_df.sort_values(0,ascending=False).head(10).index
index_list_es_18030

Int64Index([242, 157, 268, 313, 660, 1069, 814, 927, 726, 434], dtype='int64')

In [67]:
index_list_es_18030 = index_list_es_18030 + 1
index_list_es_18030 = data_es.ix[index_list_es_18030]["id"].tolist()
index_list_es_18030

[8869, 6648, 9042, 9326, 12246, 21053, 13102, 17731, 12468, 10285]

ВЫГРУЗКА

In [80]:
output = {
    "16871":index_list_en_16871, 
    "12234": index_list_en_12234, 
    "9029": [23114, 8098, 3660, 6864, 26336, 26670, 21400, 4096, 22680, 19677],
    "18030": [23114, 8098, 3660, 6864, 26336, 26670, 21400, 4096, 22680, 19677],
    "8170": index_list_ru_8170, 
    "1200": index_list_ru_1200}
output

{'16871': [20182,
  12363,
  19809,
  12952,
  13127,
  20183,
  19810,
  13125,
  20534,
  26272],
 '12234': [23256, 2164, 2162, 2161, 8101, 1765, 12384, 11273, 164, 2160],
 '9029': [23114, 8098, 3660, 6864, 26336, 26670, 21400, 4096, 22680, 19677],
 '18030': [23114, 8098, 3660, 6864, 26336, 26670, 21400, 4096, 22680, 19677],
 '8170': [8169, 20352, 1310, 867, 20291, 17127, 20103, 793, 1311, 20362],
 '1200': [1204, 952, 767, 8159, 1208, 1199, 1206, 20306, 1205, 5216]}

In [81]:
json_data = json.dumps(output)
json_data

'{"16871": [20182, 12363, 19809, 12952, 13127, 20183, 19810, 13125, 20534, 26272], "12234": [23256, 2164, 2162, 2161, 8101, 1765, 12384, 11273, 164, 2160], "9029": [23114, 8098, 3660, 6864, 26336, 26670, 21400, 4096, 22680, 19677], "18030": [23114, 8098, 3660, 6864, 26336, 26670, 21400, 4096, 22680, 19677], "8170": [8169, 20352, 1310, 867, 20291, 17127, 20103, 793, 1311, 20362], "1200": [1204, 952, 767, 8159, 1208, 1199, 1206, 20306, 1205, 5216]}'

In [82]:
with open('../lab07.json', 'w') as f:
    json.dump(output, f)

In [83]:
with open('../lab07s.json', 'w') as f:
    json.dump(output, f)