# Multilabel: Get data for text mining 

In [1]:
from collections import defaultdict

import datalab.storage as gcs
import numpy as np
import pandas as pd
import pandas_gbq
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Google Cloud project that the BigQuery job below is run under.
project_id = "my_project"

In [209]:
# Standard-SQL query: id, title and image URL of every catalog product filed
# under cat_1 "Clothing, Shoes & Jewelry" / cat_4 "Dresses" with a non-empty title.
query = """
SELECT 
pid
, data.product.title
, data.product.img_url
FROM `data_science.product_catalog_distinct`
WHERE data.categories.cat_4="Dresses"
AND data.product.title !=''
AND data.categories.cat_1="Clothing, Shoes & Jewelry"
"""

In [210]:
# Run the query on BigQuery and load the result into a dataframe.
# pandas_gbq.read_gbq is called directly: pd.read_gbq is only a thin wrapper
# around it and is deprecated in recent pandas releases.
data = pandas_gbq.read_gbq(query, project_id=project_id, dialect="standard")

Requesting query... ok.
Job ID: 0d4ddff8-02c8-4d31-ad8c-ea0c40104cba
Query running...
  Elapsed 7.72 s. Waiting...
  Elapsed 8.89 s. Waiting...
  Elapsed 10.07 s. Waiting...
  Elapsed 11.28 s. Waiting...
  Elapsed 12.47 s. Waiting...
  Elapsed 13.65 s. Waiting...
  Elapsed 14.82 s. Waiting...
  Elapsed 16.01 s. Waiting...
Query done.
Processed: 10.7 GB Billed: 10.7 GB
Standard price: $0.05 USD

Retrieving results...
Got 384995 rows.

Total time taken 41.68 s.
Finished at 2018-05-10 14:50:02.


## Mine keywords

In [213]:
 class Lemmatizer(object):
     """Callable tokenizer: split on spaces, strip commas, lemmatize.

     Intended to be passed as the ``tokenizer`` of a vectorizer, which calls
     the instance with one document string and expects a list of tokens back.
     """

     def __init__(self):
         self.wnl = WordNetLemmatizer()

     def __call__(self, doc):
         """Return the lemmatized tokens of ``doc``.

         Commas are removed *before* the length filter so that tokens such as
         ``"b,"`` or ``",,"`` cannot slip through as one-character or empty
         tokens; only tokens longer than one character are kept.
         """
         stripped = (raw.replace(',', '') for raw in doc.split(' '))
         return [self.wnl.lemmatize(token) for token in stripped if len(token) > 1]

In [214]:
# TF-IDF vectorizer used to mine keywords from the product titles.
vectorizer = TfidfVectorizer(
    tokenizer=Lemmatizer(),   # custom split + lemmatize step defined above
    analyzer='word',
    strip_accents='ascii',
    stop_words='english',
    min_df=1000,              # drop terms found in fewer than 1000 titles
    max_df=0.50,              # drop terms found in more than half of the titles
    max_features=100,         # keep at most 100 terms
)

In [215]:
# Learn the vocabulary and IDF weights from the product titles.
x = vectorizer.fit(data["title"])

In [216]:
# The vocabulary maps keyword -> column index; only the keywords are needed here.
dress_keywords = list(x.vocabulary_)

In [218]:
# Show the mined keywords in sorted order.
print(sorted(dress_keywords))

['1950s', '3/4', 'a-line', 'applique', 'backless', 'ball', 'bandage', 'beach', 'beaded', 'belt', 'big', 'black', 'blue', 'bodycon', 'boho', 'bridal', 'bride', 'bridesmaid', 'cap', 'casual', 'chiffon', 'club', 'cocktail', 'cotton', 'deep', 'elegant', 'evening', 'fit', 'flare', 'floral', 'flower', 'formal', 'girl', "girls'", 'gown', 'halter', 'high', 'homecoming', 'knee', 'lace', 'length', 'line', 'little', 'long', 'loose', 'maternity', 'maxi', 'mermaid', 'midi', 'mini', 'mother', 'navy', 'neck', 'party', 'pencil', 'pleated', 'plus', 'pocket', 'print', 'printed', 'prom', 'red', 'retro', 'round', 'ruched', 'ruffle', 'scoop', 'sequin', 'sexy', 'sheath', 'shirt', 'short', 'shoulder', 'size', 'skater', 'sleeve', 'sleeveless', 'slim', 'solid', 'spaghetti', 'split', 'strap', 'striped', 'style', 'summer', 'sweetheart', 'swing', 't-shirt', 'tank', 'tulle', 'tunic', 'u', 'v-neck', 'vintage', 'waist', 'wedding', 'white', 'woman', 'work', 'wrap']


### Get keywords for each product

In [14]:
# Build the (products x keywords) feature matrix.
# fit_transform re-learns the vocabulary from the titles before transforming them.
tfidf_matrix = vectorizer.fit_transform(data["title"])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [377]:
# Check to see if the tfidf matrix is not empty for at least 100k products in the dataframe
# Count the distinct rows that own at least one non-zero entry. Testing the
# truthiness of a row's column indices (`.any()`) would miss a product whose
# only keyword sits in column 0, because index 0 is falsy.
rows_with_keywords = np.unique(tfidf_matrix.nonzero()[0])
has_kw_count = len(rows_with_keywords)
has_kw_count

375141

In [381]:
# Get the number of products without keywords
# (all rows of `data` minus the rows counted as having at least one keyword).
len(data) - has_kw_count 

9854

In [640]:
# Create a dictionary of products, keywords
# Column index -> keyword. `vocabulary_` stores keyword -> column index, so it
# is inverted here rather than relying on a lookup table left over in the kernel.
feature_dict = {int(index): term for term, index in vectorizer.vocabulary_.items()}

d = defaultdict(list)

for doc in range(len(data)):
    # Column indices of the keywords present in this product's title.
    feature_index = tfidf_matrix[doc, :].nonzero()[1]
    # The scores themselves are not needed, only the keyword names. A product
    # without keywords never touches `d`, so it gets no entry at all.
    for k in feature_index:
        d[doc].append(feature_dict.get(int(k)))


### Add keywords back to products dataframe

In [641]:
# Make the keywords dictionary into a dataframe with one row per product.
# The index holds the keys of `d`, i.e. each product's row position in `data`
# (not its pid); the columns are the keyword slots 0..n-1.
k = pd.DataFrame.from_dict(d, orient = 'index')
# Products with fewer keywords than the widest row get missing cells in the
# trailing slots. Normalise those to NaN; replacing the *string* 'None' never
# matches a missing cell.
k = k.fillna(np.nan)

In [718]:
# Number of rows of `data` that have no row in the keyword frame `k`.
print(len(data)-len(k)) # The keyword dictionary method skips products without keywords

9854


In [647]:
# Put the keyword columns next to the product columns. The inner join on the
# index keeps only the rows present in both frames.
result = pd.concat(
    [data, k],
    axis=1,
    join='inner',
)