In [2]:
import numpy as np
import pandas as pd
import multiprocessing as mp

np.random.seed(0)

In [3]:
df = pd.read_csv('data/medium.csv', nrows=1000)
prep_df = pd.read_csv('data/prep_df.csv').applymap(str)

In [3]:
df.head(3)

Unnamed: 0,recommends,subTitle,text,title
0,2,A major private IT company implements blockcha...,"Private Business, Government and Blockchain\n\...","Private Business, Government and Blockchain"
1,0,Introduction,EPQ draft 1 (4844 words)\nhttps://upload.wikim...,EPQ draft 1 (4844 words)
2,0,Various associations in the present days are o...,"Ascent of data Science, SAS and Big data Analy...","Ascent of data Science, SAS and Big data Analy..."


In [4]:
prep_df.head(3)

Unnamed: 0,title,subTitle,text
0,private business government blockchain,major private company implement blockchain art...,private business government blockchain major p...
1,epq draft word,introduction,epq draft word introduction automation set une...
2,ascent data science sa big data analyst traini...,various association present day open entryways...,ascent data science sa big data analyst traini...


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy

for_title = TfidfVectorizer(min_df=2)
for_subTitle = TfidfVectorizer(min_df=3)
for_text = TfidfVectorizer(min_df=5)

tfidf_df = scipy.sparse.hstack([
    for_title.fit_transform(prep_df['title']),
    for_subTitle.fit_transform(prep_df['subTitle']),
    for_text.fit_transform(prep_df['text']),
]).tocsr()

In [6]:
import pickle
from pathlib import Path
Path("models").mkdir(exist_ok=True)

def dump(model, in_str):
    with open('models/' + in_str + '.pickle', 'wb') as handle:
        pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

dump(for_title, 'for_title')
dump(for_subTitle, 'for_subTitle')
dump(for_text, 'for_text')

In [7]:
from encoders import text_to_vec

def app_text_to_vec(X):
    X = X.copy()
    col_names = ['title', 'subTitle', 'text']
    col_indexed = [[col + '_' + str(i) for i in range(300)] 
                   for col in col_names]

    with mp.Pool(6) as pool:
        dfs = []
        for i in range(len(col_names)):
            dfs.append(pd.DataFrame(
                pool.map(text_to_vec, X[col_names[i]]), 
                index=X.index, 
                columns=col_indexed[i]
            ))      
       
    return pd.concat(dfs, axis=1)

In [8]:
w2v_df = app_text_to_vec(prep_df)

In [9]:
from scipy.sparse import save_npz
save_npz('data/tfidf_df.npz', tfidf_df)
w2v_df.to_csv('data/w2v_df.csv', index=False)

In [10]:
def get_shape(tpl):
    return list(map(lambda x: x.shape[1], tpl))

def split_vec(vec, splits):
    return (vec[:splits[0]],
            vec[splits[0]:splits[0] + splits[1]],
            vec[splits[0] + splits[1]:])


In [11]:
from search import *

In [12]:
a = np.ones(10)
b = np.zeros(10)
b[:5] += 1
a, b

(array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 array([1., 1., 1., 1., 1., 0., 0., 0., 0., 0.]))

In [13]:
cs(a, b)

0.7071067811865475

In [14]:
get_shape(tfidf_encode_query('machine learning'))

[11833, 11857, 63844]

In [15]:
doc = Document('machine learning', 'machine learning', 'machine learning', 5)
score('machine learning', doc)

1.0

In [10]:
from search import *

In [11]:
index._intersect_indices([
    index.word_bank['facebook'],
    index.word_bank['leak']
])

array([  653,   903,  1049,  1078,  2837,  3163,  3194,  3565,  3881,
        4660,  4885,  5047,  6399,  6672,  6818,  7325,  7359,  7431,
        7614,  7635,  7776,  8494,  8773,  9022,  9345,  9814, 10915,
       10952, 11870, 12153, 12181, 13048, 13724, 14494, 14867, 15894,
       16396, 16599, 17346, 17621, 18627, 19592, 19961, 21403, 21433,
       21877, 22350, 22379, 22428, 23142, 23240, 23538, 24484, 24863,
       24987, 25113, 25321, 26335, 26805, 27160, 27199, 28617, 29018,
       29326, 29370, 30193, 30289, 30423, 30458, 30541, 33175, 34101,
       34224, 34333, 34451, 34685, 35455, 35947, 35980, 36117, 36158,
       36348, 36393, 37299, 38113, 38224, 38398, 39087, 39112, 39147,
       39333, 39364, 39789, 39968, 40633, 40912, 41670, 41714, 41858,
       42323, 42645, 42654, 43686, 43719, 44231, 44299, 44865, 45275,
       45803, 46049, 46248, 47485, 47491, 47676, 48332, 48887, 49167,
       50273, 50285, 50293, 50568, 50893, 51267, 51872, 51954, 52162,
       53001, 54935,

In [12]:
ind = index._intersect_indices([
    index.word_bank['machine'],
    index.word_bank['learning']
])

In [14]:
df['recommends'][ind].sort_values(ascending=False).index[:100].sort_values()

Int64Index([ 1161,  2472,  3024,  3309,  4712,  5974,  7238,  9596, 14985,
            17177, 17686, 18083, 18470, 18553, 19788, 23336, 24251, 24575,
            25205, 25269, 25639, 29133, 31381, 33448, 34195, 34465, 34543,
            37454, 39891, 40945, 41329, 41584, 42645, 45934, 46451, 47654,
            50560, 51285, 52048, 54566, 54655, 54844, 54960, 55152, 55257,
            55466, 55526, 55735, 55822, 56053, 56835, 56985, 57050, 57168,
            57186, 57852, 57896, 58013, 58252, 58385, 58533, 58913, 59097,
            59237, 59426, 59640, 59703, 59837, 60056, 60291, 60337, 60671,
            60754, 61222, 61402, 61434, 61463, 61524, 61756, 61978, 62039,
            62741, 63051, 63355, 63402, 63493, 63746, 63988, 64225, 64264,
            64589, 64749, 64911, 65090, 65118, 65445, 65566, 65794, 66041,
            66129],
           dtype='int64')

In [8]:
df.iloc[32770, 2]

'How Could Emotionally Intelligent Computers Change Our Society?\nFig.1: Scene from Alex Garland’s ‘Ex Machina’\nI used to think that humans would always have one advantage over AI. From this article’s title you can probably guess that I thought the advantage was our understanding of emotions. The conversations we have with our friends, the hours we spend analysing their meaning or the reading of subtle facial cues: surely, these would be the things that humans would always be able to do better than machines. As it turns out this is not quite right. In fact, the field of affective computing or emotion AI, which focuses on developing computers’ ability to read human emotions, has been quietly flourishing since 1997 when Rosalind Pricard published her seminal paper on the subject and established the Affective Computing Group at MIT Media Lab. In the more sophisticated terms of the Group itself:\n‘Affective Computing is computing that relates to, arises from, or deliberately influences em

In [5]:
[title for title in df.title.tolist() if 'The Cultural Revolution: Robots and Trust' in title]

['The Cultural Revolution: Robots and Trust']

In [6]:
df.title.tolist().index('The Cultural Revolution: Robots and Trust')

11

In [9]:
df.iloc[11, 2]

'The Cultural Revolution: Robots and Trust\n\nIt is important to understand where we are to see what the future holds. We live in a time of hedonism, what people call a hookup culture. Of disposable relationships. The most likely outcome is for the hookup culture to evolve. To a society is a similar situation Japan finds itself now. With the commodification of relationships. People will burn out, once this has run its course. Humans will have to connect based on connection and the want for children, as everything else has been parsed for profit or commodified.\nThere are two things that have and will continue to hinder the hookup culture to this point, pregnancy and rape. While contraceptives have mitigated unwanted pregnancies, they are not fully effective. As to the latter subject, well, there likely isn’t a solution to that.\nSo what happens when we add robots?\nRobots and the Black Market\nIt is important to tackle this issue as this will be one of the main reasons for the introduc

In [15]:
def retrieve(query: str):
    # indices of documents with words from query
    if query == '':
        top = df.head(20)
    else:
        print(1)
        indices = build_indices(query)
        print(indices)

        print(2)
        query_tfidf, query_w2v = encode_query(query)
        # take a subset of documents
        print(2.5)
        sub_df_tfidf = pd.DataFrame(tfidf_df[indices].toarray())
        sub_df_w2v = w2v_df.iloc[indices, :]
        print(3)
        
        # calculate similarities
        sims = pd.Series(
            0.3 * sub_df_tfidf.apply(lambda x: score_halved(query_tfidf, x), axis=1) \
            + 0.7 * sub_df_w2v.apply(lambda x: score_halved(query_w2v, x), axis=1),
            index = indices)
        print(4)
    
        top = sims.sort_values(ascending = False).head(20)
        print('top:')
        print(top.index)
        print(5)

    return df_to_docs(df.iloc[top.index, :])

In [16]:
retrieve('machine learning')

1
Int64Index([25639,  4712, 54844, 58913, 33448, 17686, 55257, 34195, 18470,
            56835, 57896, 65445, 59097, 64589, 41329, 40945, 37454, 55735,
            62741, 52048, 41584, 17177, 24575, 61222, 64749, 14985,  3309,
            18083, 54655, 60671, 61756, 25205, 66129, 59703, 64264, 60337,
            47654, 59237, 64911, 66041, 60056, 63402, 25269, 56985, 34465,
            39891, 58533,  2472, 61524,  1161, 46451, 50560, 57168,  3024,
            65090, 61402,  5974, 65794, 19788, 31381, 63051, 51285, 59837,
            34543, 57050, 65566, 64225, 54960,  9596, 61463, 61434, 63746,
            55822, 60754, 60291, 55526, 54566, 55466, 18553, 23336, 63355,
            58013, 59426, 24251, 42645, 58252, 59640, 61978, 62039, 58385,
            56053, 65118, 63493, 57852, 45934, 55152, 63988, 57186, 29133,
             7238],
           dtype='int64')
2
2.5
3
4
top:
Int64Index([25639,  4712, 54844, 58913, 33448, 17686, 55257, 34195, 18470,
            56835, 57896, 65445, 5909

  dist = 1.0 - uv / np.sqrt(uu * vv)


[<document.Document at 0x7f2048776b00>,
 <document.Document at 0x7f200c7cc070>,
 <document.Document at 0x7f200c7cc0d0>,
 <document.Document at 0x7f200c7cc130>,
 <document.Document at 0x7f200c7cc190>,
 <document.Document at 0x7f200c7cc1f0>,
 <document.Document at 0x7f200c7cc250>,
 <document.Document at 0x7f200c7cc2b0>,
 <document.Document at 0x7f200c7cc310>,
 <document.Document at 0x7f200c7cc370>,
 <document.Document at 0x7f200c7cc3d0>,
 <document.Document at 0x7f200c7cc430>,
 <document.Document at 0x7f200c7cc490>,
 <document.Document at 0x7f200c7cc4f0>,
 <document.Document at 0x7f200c7cc550>,
 <document.Document at 0x7f200c7cc5b0>,
 <document.Document at 0x7f200c7cc610>,
 <document.Document at 0x7f200c7cc670>,
 <document.Document at 0x7f200c7cc6d0>,
 <document.Document at 0x7f200c7cc730>]