# Synopsis-to-Vector

In [1]:
import numpy as np
import pandas as pd
import pickle
import math

## Load Pre-trained CBOW Parameters

In [2]:
!wget -nc -q https://github.com/WegraLee/deep-learning-from-scratch-2/raw/master/ch04/cbow_params.pkl

In [3]:
pkl_file = 'cbow_params.pkl'

with open(pkl_file, 'rb') as f:
    params = pickle.load(f)
    word_vecs = params['word_vecs']
    word_to_id = params['word_to_id']
    id_to_word = params['id_to_word']

## Get TF-IDF from synopsis.csv

### Load Cleaned Synopsis Data

In [4]:
synopsis = pd.read_csv('../clean_data/synopsis.csv')
synopsis.head()

Unnamed: 0,MAL_ID,Synopsis
0,1,solar leaving surface planet earth solar polic...
1,5,day life crew routine interrupted chasing targ...
2,6,head reason waste oppose entire cities fun tit...
3,7,individuals powers mind control robin craft us...
4,8,dark century people suffering rule manipulate ...


### Get Word Set from Synopsis Data

In [5]:
syns = synopsis.Synopsis.str.findall('\w+')

vocab = set()
for syn in syns:
    vocab.update(syn)

vocab = sorted(list(vocab))
vocab_size = len(vocab)

### Get TF(Term Frequency)

In [6]:
def get_tf(word, syn):
    # Term Frequency
    return syn.count(word)

    
tf = []
for syn in syns:
    tf_syn = []
    for word in vocab:
        tf_syn.append(get_tf(word, syn))
    tf.append(tf_syn)

tf = np.asarray(tf)
tf.shape

(6000, 5918)

### Get IDF(Inverse Document Frequency)

In [7]:
def get_idf(term):
    # Inverse Document Frequency
    _df = 0
    for syn in syns:
        _df += int(word in syn)
    return math.log(vocab_size/(_df))


idf = []
for word in vocab:
    idf.append(get_idf(word))

idf = np.asarray(idf)
idf.shape

(5918,)

### Get TF-IDF Table

In [8]:
tf_idf = pd.DataFrame(tf * idf, columns=vocab)

## Generate Sentence Vector of Synopsis

In [9]:
def get_word_vec(word):
    return word_vecs[word_to_id[word]]

def get_tf_idf(word, i):
    return tf_idf.loc[i, word]

def get_sent_vec(i):
    sent_vec = np.zeros(100, dtype=np.float16)

    for word in set(syns[i]):
        sent_vec += get_word_vec(word) * get_tf_idf(word, i)
        # sent_vec += get_word_vec(word)
    
    return sent_vec / len(set(syns[i]))

In [10]:
full_sent_vec = []
for i in range(len(syns)):
    full_sent_vec.append(get_sent_vec(i))
    
sent_vec_df = pd.DataFrame(full_sent_vec)
sent_vec_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.114563,-3.921875,0.508789,1.326172,2.681641,1.789062,-1.769531,-1.250977,-1.898438,1.074219,...,0.863770,-0.738281,-1.119141,-0.729980,1.062500,1.274414,-0.247559,1.378906,1.920898,2.222656
1,2.312500,-2.361328,-0.013504,2.705078,1.199219,3.171875,-1.360352,-2.603516,-1.960938,-0.565430,...,0.493652,-0.946777,-0.937988,-0.260986,2.085938,1.500000,-0.222168,0.646973,2.218750,0.959961
2,-0.225952,-2.601562,-0.183105,2.042969,1.671875,1.951172,-1.368164,-2.917969,-0.161377,0.051392,...,-1.435547,-1.021484,-2.537109,-2.455078,2.013672,1.943359,0.622559,1.703125,2.359375,1.231445
3,0.842773,-0.527344,0.271484,0.753906,1.575195,2.496094,-1.652344,-2.703125,-2.271484,1.468750,...,-0.367676,-1.789062,-1.804688,-2.474609,2.878906,0.426025,-0.582520,3.107422,0.837402,1.043945
4,1.521484,-2.650391,0.423584,1.816406,2.029297,1.696289,0.607910,-1.335938,-1.022461,0.936035,...,1.649414,-1.704102,-2.476562,-4.484375,1.402344,0.638184,0.058777,1.250977,2.457031,2.515625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0.905762,-1.422852,1.901367,2.382812,1.666992,0.901855,-1.924805,-0.843262,-1.282227,1.339844,...,1.071289,-0.565430,-1.355469,-2.287109,0.418457,1.643555,0.829102,1.502930,2.166016,2.941406
5996,2.136719,-1.450195,0.351074,2.123047,0.429688,-0.171631,-0.983887,-2.193359,-2.029297,1.022461,...,-0.837402,-1.895508,-2.449219,-2.021484,1.227539,0.562988,-1.253906,2.988281,2.492188,1.022461
5997,1.147461,-2.382812,0.937988,0.431396,1.120117,2.529297,-1.699219,-1.220703,-1.709961,1.555664,...,0.712891,-0.082397,-1.974609,-0.666992,2.466797,0.973633,1.821289,-0.183594,2.945312,0.713867
5998,0.944824,-1.447266,-0.097656,0.824707,1.160156,1.099609,-0.374756,-2.130859,-0.812012,2.248047,...,1.912109,0.847168,-1.115234,-1.066406,1.034180,1.030273,-0.781250,0.201904,2.701172,1.936523


In [11]:
sent_vec_df.insert(0, 'MAL_ID', synopsis.MAL_ID)
sent_vec_df

Unnamed: 0,MAL_ID,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,1,-0.114563,-3.921875,0.508789,1.326172,2.681641,1.789062,-1.769531,-1.250977,-1.898438,...,0.863770,-0.738281,-1.119141,-0.729980,1.062500,1.274414,-0.247559,1.378906,1.920898,2.222656
1,5,2.312500,-2.361328,-0.013504,2.705078,1.199219,3.171875,-1.360352,-2.603516,-1.960938,...,0.493652,-0.946777,-0.937988,-0.260986,2.085938,1.500000,-0.222168,0.646973,2.218750,0.959961
2,6,-0.225952,-2.601562,-0.183105,2.042969,1.671875,1.951172,-1.368164,-2.917969,-0.161377,...,-1.435547,-1.021484,-2.537109,-2.455078,2.013672,1.943359,0.622559,1.703125,2.359375,1.231445
3,7,0.842773,-0.527344,0.271484,0.753906,1.575195,2.496094,-1.652344,-2.703125,-2.271484,...,-0.367676,-1.789062,-1.804688,-2.474609,2.878906,0.426025,-0.582520,3.107422,0.837402,1.043945
4,8,1.521484,-2.650391,0.423584,1.816406,2.029297,1.696289,0.607910,-1.335938,-1.022461,...,1.649414,-1.704102,-2.476562,-4.484375,1.402344,0.638184,0.058777,1.250977,2.457031,2.515625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,48438,0.905762,-1.422852,1.901367,2.382812,1.666992,0.901855,-1.924805,-0.843262,-1.282227,...,1.071289,-0.565430,-1.355469,-2.287109,0.418457,1.643555,0.829102,1.502930,2.166016,2.941406
5996,48466,2.136719,-1.450195,0.351074,2.123047,0.429688,-0.171631,-0.983887,-2.193359,-2.029297,...,-0.837402,-1.895508,-2.449219,-2.021484,1.227539,0.562988,-1.253906,2.988281,2.492188,1.022461
5997,48470,1.147461,-2.382812,0.937988,0.431396,1.120117,2.529297,-1.699219,-1.220703,-1.709961,...,0.712891,-0.082397,-1.974609,-0.666992,2.466797,0.973633,1.821289,-0.183594,2.945312,0.713867
5998,48483,0.944824,-1.447266,-0.097656,0.824707,1.160156,1.099609,-0.374756,-2.130859,-0.812012,...,1.912109,0.847168,-1.115234,-1.066406,1.034180,1.030273,-0.781250,0.201904,2.701172,1.936523


In [12]:
sent_vec_df.to_csv('synopsis_vector.csv', index=False)