## DS5559 - Project
## Notebook 2 - Make Vector Space
#### Name: Mengyao Zhang (mz6jv), Runhao Zhao (rz6dg)

# Synopsis
Use case: create a vector space model

# Configuration

In [1]:
# SQLite database file that this notebook reads from and writes back to.
db_name = 'project.db'

# Ordered index levels of the token table (book -> chapter -> paragraph -> sentence -> token).
OHCO = ['book_num', 'chap_num', 'para_num', 'sent_num', 'token_num']

# Container levels expressed as OHCO prefixes of length 1 to 4.
BOOKS, CHAPS, PARAS, SENTS = (OHCO[:depth] for depth in range(1, 5))

In [2]:
BAG = PARAS

# Libraries

In [3]:
import itertools
import sqlite3

import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer

In [6]:
# Fetch the NLTK resources by name; the log below shows all five were already up to date.
# NOTE(review): no NLTK function is called in the visible part of this notebook --
# confirm these downloads are still needed here.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('tagsets')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/mz6jv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mz6jv/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/mz6jv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package tagsets to /home/mz6jv/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mz6jv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Process

In [4]:
# Load the token table (indexed by the full OHCO) and the vocabulary table
# (indexed by term_id) from the project database.
# A sqlite3 connection used as a context manager only commits or rolls back;
# it does not close the connection, so close it explicitly once both reads finish.
db = sqlite3.connect(db_name)
try:
    K = pd.read_sql('SELECT * FROM token', db, index_col=OHCO)
    V = pd.read_sql('SELECT * FROM vocab', db, index_col='term_id')
finally:
    db.close()

In [8]:
V.head()

Unnamed: 0_level_0,term_str,n,p,port_stem,stop
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,*ah,1,1.63482e-07,*ah,0
1,.but,2,3.269639e-07,.but,0
2,.he,1,1.63482e-07,.he,0
3,.i,5,8.174098e-07,.i,0
4,.is,1,1.63482e-07,.i,0


In [12]:
V.n.describe()

count     45072.000000
mean        135.713592
std        2557.581371
min           1.000000
25%           1.000000
50%           4.000000
75%          19.000000
max      240885.000000
Name: n, dtype: float64

In [13]:
V.shape

(45072, 5)

In [14]:
V[V.n>=20].n.count() # find count of terms appearing >=20 times (pick 20 b/c ~ the 75th percentile)

10970

## Create DTM


### Create word mask

Filter out stopwords, punctuation, and numbers.

In [8]:
# Boolean mask over the token table: True where the token is flagged neither as
# punctuation nor as a number, and its term_id belongs to a vocab row with stop == 0.
WORDS = (K.punc == 0) & (K.num == 0) & K.term_id.isin(V[V.stop==0].index)

### Extract BOW from tokens

In [12]:
BAG

['book_num', 'chap_num', 'para_num']

In [9]:
def get_bow(token_df, bag_name, item_name, word_mask):
    """Build a bag-of-words count series from the tokens table.

    Parameters
    ----------
    token_df : the tokens data frame
    bag_name : list of OHCO level names that define one bag (document)
    item_name : name of the column whose values are counted within each bag
    word_mask : boolean mask selecting the rows of token_df to keep

    Returns
    -------
    Series named item_name, indexed by bag_name + [item_name], holding the
    number of non-null item_name values in each (bag, item) group.
    """
    group_keys = bag_name + [item_name]
    kept_tokens = token_df[word_mask]
    per_bag_item = kept_tokens.groupby(group_keys)
    return per_bag_item[item_name].count()

In [10]:
BOW = get_bow(K,BAG,'term_id',WORDS)

In [11]:
BOW.head()

book_num  chap_num  para_num  term_id
1         0         0         6874       1
                    1         6192       1
                    2         4907       1
                              7087       1
                              7272       1
Name: term_id, dtype: int64

In [12]:
BOW.shape

(2505581,)

In [13]:
# changed series to dataframe
BOW = BOW.to_frame().rename(columns={'term_id':'n'})

In [14]:
BOW.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,n
book_num,chap_num,para_num,term_id,Unnamed: 4_level_1
1,0,0,6874,1
1,0,1,6192,1
1,0,2,4907,1
1,0,2,7087,1
1,0,2,7272,1


In [15]:
BOW_new = BOW.reset_index()

In [16]:
BOW_new.head()

Unnamed: 0,book_num,chap_num,para_num,term_id,n
0,1,0,0,6874,1
1,1,0,1,6192,1
2,1,0,2,4907,1
3,1,0,2,7087,1
4,1,0,2,7272,1


In [17]:
# filter out infrequent vocab (term that appeared less than 20 times in corpus)
BOW_new = BOW_new[BOW_new.term_id.isin(V[V.n >= 20].index)]

In [18]:
BOW_new.shape

(2362681, 5)

In [19]:
# Re-index by the bag levels plus term_id. BAG is used instead of a hard-coded
# OHCO[:3] because the BOW was grouped by BAG, so those are the columns that
# exist after reset_index(); a fixed slice raises KeyError if BAG is changed.
BOW_new = BOW_new.set_index(BAG + ['term_id'])

In [20]:
BOW_new.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,n
book_num,chap_num,para_num,term_id,Unnamed: 4_level_1
1,0,0,6874,1
1,0,1,6192,1
1,0,2,4907,1
1,0,2,7087,1
1,0,2,7272,1


In [4]:
# with sqlite3.connect(db_name) as db:
#     BOW_new = pd.read_sql('SELECT * FROM BOW', db, index_col=(OHCO[:3]+['term_id']))    

In [5]:
BOW_new.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,n
book_num,chap_num,para_num,term_id,Unnamed: 4_level_1
1,0,0,6874,1
1,0,1,6192,1
1,0,2,4907,1
1,0,2,7087,1
1,0,2,7272,1


## Convert BOW to DTM

#### Unstack BOW

In [6]:
DTM = BOW_new.unstack().fillna(0) # takes ~ 30s

In [7]:
DTM.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n,n
Unnamed: 0_level_1,Unnamed: 1_level_1,term_id,18,22,23,32,35,40,43,50,62,72,...,44915,44929,44930,44933,44940,44944,44958,44963,44969,45035
book_num,chap_num,para_num,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
DTM.columns = DTM.columns.droplevel(0) 

In [9]:
DTM.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,term_id,18,22,23,32,35,40,43,50,62,72,...,44915,44929,44930,44933,44940,44944,44958,44963,44969,45035
book_num,chap_num,para_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
DTM.shape

(108658, 10817)

#### Reduce number of terms
Terms appearing in 0.1% of the documents or fewer were deemed less important, so we removed them.

In [10]:
# Shrink the matrix by dropping rare terms: keep only the columns whose share of
# documents with a non-zero count is strictly above 0.1%.
new_DTM = DTM.loc[:, (1 - (DTM == 0).sum(axis=0) / len(DTM)) > 0.001]

In [11]:
# check shape ---- terms reduced to around 3,500
new_DTM.shape

(108658, 3459)

In [18]:
new_DTM.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,term_id,23,40,95,106,140,144,150,153,161,162,...,44860,44864,44872,44873,44880,44904,44911,44914,44929,44930
book_num,chap_num,para_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Prepare data for calculating TFIDF

In [12]:
DTM_sklearn = new_DTM.reset_index()

In [13]:
DTM_sklearn.head()

term_id,book_num,chap_num,para_num,23,40,95,106,140,144,150,...,44860,44864,44872,44873,44880,44904,44911,44914,44929,44930
0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# save the OHCO indices
index_df = DTM_sklearn[BAG]

In [15]:
index_df.head()

term_id,book_num,chap_num,para_num
0,1,0,0
1,1,0,1
2,1,0,2
3,1,0,3
4,1,0,4


In [23]:
# save column names
col_names = DTM_sklearn.columns

In [24]:
col_names

Index(['book_num', 'chap_num', 'para_num',         23,         40,         95,
              106,        140,        144,        150,
       ...
            44860,      44864,      44872,      44873,      44880,      44904,
            44911,      44914,      44929,      44930],
      dtype='object', name='term_id', length=3462)

In [26]:
# input to sklearn TfidfTransformer 
X = DTM_sklearn.drop(BAG,axis=1)

In [27]:
X.head()

term_id,23,40,95,106,140,144,150,153,161,162,...,44860,44864,44872,44873,44880,44904,44911,44914,44929,44930
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# change df to numpy array
X_1 = np.array(X)

In [29]:
X_1

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Compute TFIDF

In [30]:
# Create the transformer with its default settings, then fit it on the
# document-term count array and transform that same array in one step.
# (TfidfTransformer is imported in the notebook's import cell.)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X_1)

In [35]:
type(tfidf)

scipy.sparse.csr.csr_matrix

In [31]:
# change csr_matrix to np array 
tfidf_np = tfidf.toarray()

In [32]:
tfidf_np

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#### Calculate sum, mean, and max for each column
(i.e. the TF-IDF sum, mean, and max for each term in the corpus)

In [33]:
tfidf_sum = tfidf_np.sum(axis=0)
tfidf_mean = tfidf_np.mean(axis=0)
tfidf_max = tfidf_np.max(axis=0)

In [34]:
# Per-term TF-IDF summary table (sum, mean, max over all documents).
# The term ids are the column labels that follow the bag columns, so skip
# len(BAG) labels instead of a fixed 3; this keeps terms aligned if BAG changes.
tfidf_stat = pd.DataFrame({'term_id': col_names[len(BAG):], 'tfidf_sum': tfidf_sum,
                           'tfidf_mean': tfidf_mean, 'tfidf_max': tfidf_max})

In [35]:
tfidf_stat.head()

Unnamed: 0,term_id,tfidf_sum,tfidf_mean,tfidf_max
0,23,24.897749,0.000229,0.595259
1,40,57.759145,0.000532,0.893906
2,95,212.507693,0.001956,0.832875
3,106,26.184027,0.000241,0.68665
4,140,74.903485,0.000689,0.852027


In [36]:
tfidf_stat.shape

(3459, 4)

In [37]:
tfidf_stat = tfidf_stat.set_index('term_id')

In [38]:
tfidf_stat.head()

Unnamed: 0_level_0,tfidf_sum,tfidf_mean,tfidf_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
23,24.897749,0.000229,0.595259
40,57.759145,0.000532,0.893906
95,212.507693,0.001956,0.832875
106,26.184027,0.000241,0.68665
140,74.903485,0.000689,0.852027


In [53]:
tfidf_df = pd.DataFrame(tfidf_np)

In [54]:
tfidf_df.shape

(108658, 3459)

In [56]:
# add meta data (OHCO info) back
TFIDF = pd.concat([index_df,tfidf_df],axis=1,ignore_index=True)

In [59]:
TFIDF.columns = col_names

In [60]:
TFIDF.head()

term_id,book_num,chap_num,para_num,23,40,95,106,140,144,150,...,44860,44864,44872,44873,44880,44904,44911,44914,44929,44930
0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
TFIDF = TFIDF.set_index(BAG)

## Add stats to V

In [39]:
V =  pd.concat([V, tfidf_stat], axis=1)

In [40]:
# Terms eliminated while building the BOW / reduced DTM have no TF-IDF stats
# after the concat. Zero-fill only those stat columns so that missing values in
# the other vocab columns are not silently overwritten with 0.
V = V.fillna({stat_col: 0 for stat_col in tfidf_stat.columns})

In [43]:
V.head()

Unnamed: 0_level_0,term_str,n,p,port_stem,stop,tfidf_sum,tfidf_mean,tfidf_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,*ah,1,1.63482e-07,*ah,0,0.0,0.0,0.0
1,.but,2,3.269639e-07,.but,0,0.0,0.0,0.0
2,.he,1,1.63482e-07,.he,0,0.0,0.0,0.0
3,.i,5,8.174098e-07,.i,0,0.0,0.0,0.0
4,.is,1,1.63482e-07,.i,0,0.0,0.0,0.0


## Create Docs table

In [44]:
# DTM as np array
X_1

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [45]:
# find sum for each row (each doc)
term_count = X_1.sum(axis=1)

In [46]:
term_count_df = pd.DataFrame({'term_count':term_count})

In [47]:
# One row per document (bag) holding its total retained-term count, indexed by
# the bag levels. BAG replaces the hard-coded OHCO[:3] because index_df was
# built from the BAG columns, so the index stays valid if BAG is changed.
D = pd.concat([index_df, term_count_df], axis=1).set_index(BAG)

In [48]:
D['tf'] = D.term_count / D.term_count.sum()

In [49]:
D.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,term_count,tf
book_num,chap_num,para_num,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,0,0.0,0.0
1,0,1,1.0,4.433379e-07
1,0,2,24.0,1.064011e-05
1,0,3,18.0,7.980082e-06
1,0,4,41.0,1.817685e-05


In [50]:
D.shape

(108658, 2)

## Create bags table

In [None]:
# create bags to save OHCO structure for future use (ex: PCA)
bags = index_df.copy()
bags.columns.name = None
bags.index.names = ['bag_id']

In [30]:
bags.head()

Unnamed: 0_level_0,book_num,chap_num,para_num
bag_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,0,0
1,1,0,1
2,1,0,2
3,1,0,3
4,1,0,4


#### Add book author info to bags

In [None]:
book_meta = pd.read_csv('Book_Index.csv')

In [None]:
book_meta.head()

In [None]:
bags_meta = bags.set_index('book_num').join(book_meta.set_index('book_num'),how='left')

In [None]:
bags_meta = bags_meta.reset_index()
bags_meta.index.names = ['bag_id']

## Get all doc pairs

In [21]:
# Enumerate every unordered pair of distinct document ids exactly once (doc_x < doc_y).
# combinations() over the sorted unique ids yields the same pairs as comparing
# every id against every other id, without visiting the discarded half.
# NOTE: the number of pairs grows quadratically with the number of documents.
chap_ids = D.index.tolist()
pairs = list(itertools.combinations(sorted(set(chap_ids)), 2))
# Naming the columns up front keeps this working when there are no pairs at all.
P = pd.DataFrame(pairs, columns=['doc_x', 'doc_y']).set_index(['doc_x', 'doc_y'])

In [22]:
P.head()

doc_x,doc_y
0,1
0,2
0,3
0,4
0,5


## Compute Euclidean distance

In [23]:
def euclidean(row):
    """Euclidean distance between the two TFIDF rows named by a pair row.

    `row.name` is the index entry of the pair; its first two elements are used
    as labels into the global TFIDF table.
    """
    doc_x, doc_y = row.name[0], row.name[1]
    diff = TFIDF.loc[doc_x] - TFIDF.loc[doc_y]
    return np.sqrt((diff ** 2).sum())

In [24]:
# NOTE(review): P has no columns at this point (both were moved into the index);
# the placeholder column presumably exists so the row-wise apply below has data
# to iterate over -- confirm before removing it.
P['euclidean'] = 0
# Row-wise (axis=1) distance for every (doc_x, doc_y) pair.
P['euclidean'] = P.apply(euclidean, 1)

In [25]:
P.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,euclidean
doc_x,doc_y,Unnamed: 2_level_1
0,1,0.195388
0,2,0.170974
0,3,0.183043
0,4,0.194519
0,5,0.166545


## Compute Cosine similarity

In [26]:
def cosine(row):
    """Cosine similarity between the two TFIDF rows named by a pair row.

    `row.name` is the index entry of the pair; its first two elements are used
    as labels into the global TFIDF table.

    Returns dot(x, y) / (||x|| * ||y||), or NaN when either vector has zero
    length, in which case the similarity is undefined.
    """
    D1 = TFIDF.loc[row.name[0]]
    D2 = TFIDF.loc[row.name[1]]
    dot = (D1 * D2).sum()
    norm_x = np.sqrt((D1 ** 2).sum())
    norm_y = np.sqrt((D2 ** 2).sum())
    # The norms are already square roots; multiply them directly. Taking the
    # square root again would divide by sqrt(||x|| * ||y||) instead.
    denom = norm_x * norm_y
    if denom == 0:
        return np.nan
    return dot / denom

In [27]:
P['cosine'] = P.apply(cosine, 1)

In [28]:
P.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,euclidean,cosine
doc_x,doc_y,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,0.195388,0.014051
0,2,0.170974,0.011021
0,3,0.183043,0.014445
0,4,0.194519,0.007439
0,5,0.166545,0.008341


# Save data

In [29]:
# Persist the updated tables, replacing any existing ones of the same name.
# `with db:` commits on success and rolls back on error, but it does not close
# the connection, so the connection is closed explicitly in `finally`.
db = sqlite3.connect(db_name)
try:
    with db:
        V.to_sql('vocab', db, if_exists='replace', index=True)
        K.to_sql('token', db, if_exists='replace', index=True)
        D.to_sql('doc', db, if_exists='replace', index=True)
        #P.to_sql('docpair', db, if_exists='replace', index=True)
        BOW_new.to_sql('BOW', db, if_exists='replace', index=True)
        bags_meta.to_sql('bag', db, if_exists='replace', index=True)
finally:
    db.close()
#     TFIDF.stack().to_frame().rename(columns={0:'term_weight'})\
#         .to_sql('dtm_tfidf', db, if_exists='replace', index=True)
# We didn't save the full TFIDF because it is too big

In [18]:
# END