### Building the Inverted Indices
#### Group: William Chirciu, Amy Edwards
#### CSC 575 - Online 810

### Import libraries

In [62]:
import pandas as pd
from collections import defaultdict
import re
import nltk
import math
import string
import csv
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
from nltk.corpus import stopwords

### Read in Train and Test data

In [63]:
# Load the tab-separated train and test files, then discard the columns
# that are not needed for indexing before previewing the training rows.
train = pd.read_csv('train_new.csv', sep='\t')
test = pd.read_csv('test_new.csv', sep='\t')
train = train.drop(columns=['id', 'relevance'])
test = test.drop(columns=['id'])
train.head()

Unnamed: 0,product_uid,product_title,search_term
0,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket
1,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket
2,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over
3,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head
4,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet


In [64]:
# Preview the first five rows of the test frame.
test.head(n=5)

Unnamed: 0,product_uid,product_title,search_term
0,100001,Simpson Strong-Tie 12-Gauge Angle,metal l brackets
1,100001,Simpson Strong-Tie 12-Gauge Angle,simpson sku able
2,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong ties
3,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong tie hcc668
4,100003,STERLING Ensemble 33-1/4 in. x 60 in. x 75-1/4...,bath and shower kit


### Combine Train and Test Data into single dataframe

In [65]:
# Stack train on top of test. `DataFrame.append` was removed in pandas 2.0;
# `pd.concat` is the supported equivalent and, like `append`, keeps each
# frame's own row labels.
df = pd.concat([train, test])
df.head()

Unnamed: 0,product_uid,product_title,search_term
0,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket
1,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket
2,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over
3,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head
4,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet


#### Compute the total number of documents

In [66]:
# Corpus size: the number of distinct product ids in the combined frame.
N = df.product_uid.nunique()    # total number of documents
N

106650

### String Formatting
#### Handles units, special characters, decimal numbers, parentheses, and sizes
#### Credit to: S. Li, “str_stem,” GitHubGist, Oct-2018. [Online]. Available: https://gist.github.com/susanli2016/b83d148de7394821509bd5172d2c96d3

In [67]:
stemmer = nltk.PorterStemmer()

# Ordered "<number> <unit>" normalisation rules. Order matters: a rule sees
# the output of every rule before it, and within an alternation the longest
# spelling must come first so that a short form cannot match a prefix of a
# longer one.
_UNIT_RULES = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # "1 . 5" -> "1.5"
        (r"([0-9])( *)\.( *)([0-9])", r"\1.\4"),
        # NOTE(review): the single quote is consumed here, so the "''"
        # alternative of the feet rule below can never match.
        (r"([0-9]+)( *)(inches|inch|in|')\.?", r"\1in. "),
        (r"([0-9]+)( *)(foot|feet|ft|'')\.?", r"\1ft. "),
        (r"([0-9]+)( *)(pounds|pound|lbs|lb)\.?", r"\1lb. "),
        (r"([0-9]+)( *)(square|sq) ?\.?(feet|foot|ft)\.?", r"\1sq.ft. "),
        (r"([0-9]+)( *)(cubic|cu) ?\.?(feet|foot|ft)\.?", r"\1cu.ft. "),
        # Flow rates must run before the plain gallon rule, otherwise
        # "gallons" is rewritten first and these patterns never match.
        (r"([0-9]+)( *)(gallons per minute|gallon per minute|gal per minute|gallons/min)\.?",
         r"\1 gal. per min. "),
        (r"([0-9]+)( *)(gallons per hour|gallon per hour|gal per hour|gallons/hour|gallons/hr)\.?",
         r"\1 gal. per hr "),
        (r"([0-9]+)( *)(gallons|gallon|gal)\.?", r"\1gal. "),
        (r"([0-9]+)( *)(ounces|ounce|oz)\.?", r"\1oz. "),
        (r"([0-9]+)( *)(centimeters|cm)\.?", r"\1cm. "),
        (r"([0-9]+)( *)(millimeters|milimeters|mm)\.?", r"\1mm. "),
        (r"([0-9]+)( *)(°|degrees|degree)\.?", r"\1 deg. "),
        # "volts" before "v": with "v" first, "12 volts" became "12 volt. olts".
        (r"([0-9]+)( *)(volts|volt|v)\.?", r"\1 volt. "),
        (r"([0-9]+)( *)(wattage|watts|watt)\.?", r"\1 watt. "),
        (r"([0-9]+)( *)(amperes|ampere|amps|amp)\.?", r"\1 amp. "),
        (r"([0-9]+)( *)(quarts|qquart|quart)\.?", r"\1 qt. "),
        # The optional trailing dot is handled by "\.?"; the former "hrs."
        # used an unescaped dot that demanded one extra arbitrary character.
        (r"([0-9]+)( *)(hours|hour|hrs)\.?", r"\1 hr "),
    )
]

# Plain substring replacements (HTML entities and markup residue).
_LITERAL_RULES = (
    ("$", " "),
    ("?", " "),
    ("...", " "),
    ("..", " "),
    ("&nbsp;", " "),
    ("&amp;", "&"),
    ("&#39;", "'"),
    ("/>/Agt/>", ""),
    ("</a<gt/", ""),
    ("gt/>", ""),
    ("/>", ""),
    ("<br", ""),
)

# Regex clean-up rules. These patterns used to be passed to `str.replace`,
# which treats its argument as a literal substring, so none of them ever
# fired. Bracketed text and sizes are removed BEFORE the special-character
# rule, because that rule turns "(", ")" and ":" into spaces and would
# otherwise leave nothing for them to match.
# NOTE(review): enabling these rules changes the tokens that reach the
# index; anything that normalises queries should use the same function.
_REGEX_RULES = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"<.+?>", ""),                          # remaining markup tags
        (r"'s\b", ""),                           # possessive suffix
        (r"[ ]?[\[(].+?[\])]", ""),              # text between parentheses/brackets
        (r"size: .+$", ""),                      # sizes
        (r"size [0-9]+[.]?[0-9]+\b", ""),
        (r"[ &<>)(_,;:!?\+^~@#\$]+", " "),       # special characters
        (r"[']+", ""),
        (r'["]+', ""),
    )
]


def str_stem(s):
    """Normalise a product/query string and return its stemmed form.

    Processing steps, in order:
      1. rewrite "<number> <unit>" spellings to one canonical abbreviation;
      2. replace HTML entities and markup residue;
      3. strip tags, possessives, bracketed text, size annotations, special
         characters and quotes; turn "-" and "+" into spaces;
      4. lower-case, split on whitespace, blank out every character that is
         not a letter, digit, "-", "." or "/", and Porter-stem each word.

    Parameters
    ----------
    s : object
        Text to normalise.

    Returns
    -------
    str
        The stemmed words joined by single spaces, or the literal string
        "null" when `s` is not a `str` (e.g. a NaN from a missing value).
    """
    if not isinstance(s, str):
        return "null"

    # NOTE(review): the unit patterns are lower-case and lower-casing only
    # happens in the final step, so capitalised units such as "10 Ft" are
    # not normalised here.
    for pattern, replacement in _UNIT_RULES:
        s = pattern.sub(replacement, s)

    for old, new in _LITERAL_RULES:
        s = s.replace(old, new)

    for pattern, replacement in _REGEX_RULES:
        s = pattern.sub(replacement, s)

    s = s.replace("-", " ")
    s = s.replace("+", " ")

    return " ".join(
        stemmer.stem(re.sub('[^A-Za-z0-9-./]', ' ', word))
        for word in s.lower().split()
    )

### Creating Inverted Index for Product Titles

In [68]:
inverted_index = {}                        # term -> (idf, {product_uid: term frequency}); idf is a 0.0 placeholder here
docs = {}                                  # product_uid -> number of tokens kept from its title
porter = nltk.PorterStemmer()              # not referenced in this cell (stemming happens inside str_stem)

# Build the stop-word set once. Calling stopwords.words() inside the token
# filter rebuilt the list for every single token and searched it linearly.
stop_words = set(stopwords.words('english'))

for index, row in df.iterrows():
    doc_id = row['product_uid']
    # The frame repeats a product once per search term (see the preview
    # above); each title is indexed only the first time it is seen.
    if doc_id in docs:
        continue
    tokens = [
        w for w in word_tokenize(str_stem(row['product_title']))
        if w not in stop_words and w != '.' and w != '/'
    ]
    docs[doc_id] = len(tokens)

    # Count every term in one pass instead of calling tokens.count() per token.
    term_freq = defaultdict(int)
    for tok in tokens:
        term_freq[tok] += 1

    for tok, tf in term_freq.items():
        if tok not in inverted_index:
            inverted_index[tok] = (0.0, {})
        inverted_index[tok][1][doc_id] = tf

#### Compute IDFs for each term

In [69]:
# Swap each term's placeholder weight for its IDF: log10 of the corpus size
# divided by the number of titles that contain the term. Only existing keys
# are reassigned, so the dictionary keeps its size while it is iterated.
for term, (_, postings) in inverted_index.items():
    doc_freq = len(postings)
    inverted_index[term] = (math.log10(N / doc_freq), postings)
    

### Read in Product Descriptions

In [70]:
# Options shared by both calls below. QUOTE_NONE makes the parser treat
# quote characters as ordinary text.
_read_opts = dict(delimiter='\t', header=0, quoting=csv.QUOTE_NONE)
try:
    # pandas >= 1.3: malformed rows are reported and skipped.
    description = pd.read_csv('product_descriptions_new.csv', on_bad_lines='warn', **_read_opts)
except TypeError:
    # Older pandas does not know `on_bad_lines`; use the legacy flag, which
    # newer releases (2.0+) no longer accept.
    description = pd.read_csv('product_descriptions_new.csv', error_bad_lines=False, **_read_opts)
description.head()

Unnamed: 0,product_uid,product_description
0,100001,"Not only do angles make joints stronger, they ..."
1,100002,BEHR Premium Textured DECKOVER is an innovativ...
2,100003,Classic architecture meets contemporary design...
3,100004,The Grape Solar 265-Watt Polycrystalline PV So...
4,100005,Update your bathroom with the Delta Vero Singl...


#### Merge Descriptions with full Dataframe

In [71]:
# Attach each product's description to the combined frame. A left join
# keeps every row of `df`, including products without a description.
df_full = pd.merge(df, description, how='left', on='product_uid')
df_full.head()

Unnamed: 0,product_uid,product_title,search_term,product_description
0,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,"Not only do angles make joints stronger, they ..."
1,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,"Not only do angles make joints stronger, they ..."
2,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,BEHR Premium Textured DECKOVER is an innovativ...
3,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,Update your bathroom with the Delta Vero Singl...
4,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,Update your bathroom with the Delta Vero Singl...


### Creating Inverted Index for Product Descriptions

In [72]:
inverted_index_desc = {}                            # term -> (idf, {product_uid: term frequency}); idf is a 0.0 placeholder here
docs_desc = {}                                      # product_uid -> number of tokens kept from its description

# Build the stop-word set once. Calling stopwords.words() inside the token
# filter rebuilt the list for every single token and searched it linearly.
stop_words = set(stopwords.words('english'))

for index, row in df_full.iterrows():
    doc_id = row['product_uid']
    # A product appears once per search term; index its description once.
    if doc_id in docs_desc:
        continue
    # str_stem returns the string "null" for non-string input, so a missing
    # description is indexed as the single token "null".
    tokens = [
        w for w in word_tokenize(str_stem(row['product_description']))
        if w not in stop_words and w != '.' and w != '/'
    ]
    docs_desc[doc_id] = len(tokens)

    # Count every term in one pass instead of calling tokens.count() per token.
    term_freq = defaultdict(int)
    for tok in tokens:
        term_freq[tok] += 1

    for tok, tf in term_freq.items():
        if tok not in inverted_index_desc:
            inverted_index_desc[tok] = (0.0, {})
        inverted_index_desc[tok][1][doc_id] = tf

#### Compute IDFs for each term

In [73]:
# Swap each term's placeholder weight for its IDF: log10 of the corpus size
# divided by the number of descriptions that contain the term. Only existing
# keys are reassigned, so the dictionary keeps its size while it is iterated.
for term, (_, postings) in inverted_index_desc.items():
    doc_freq = len(postings)
    inverted_index_desc[term] = (math.log10(N / doc_freq), postings)

### Read in Attributes

In [74]:
# Options shared by both calls below. QUOTE_NONE makes the parser treat
# quote characters as ordinary text.
_read_opts = dict(delimiter='\t', header=0, quoting=csv.QUOTE_NONE)
try:
    # pandas >= 1.3: malformed rows are reported and skipped.
    att = pd.read_csv('attributes_new.csv', on_bad_lines='warn', **_read_opts)
except TypeError:
    # Older pandas does not know `on_bad_lines`; use the legacy flag, which
    # newer releases (2.0+) no longer accept.
    att = pd.read_csv('attributes_new.csv', error_bad_lines=False, **_read_opts)

# Keep only product id and attribute value, drop incomplete rows, and make
# the id an integer so it can be joined against the integer ids of df_full.
att = att.drop(['name'], axis=1)
att = att.dropna()
att.product_uid = att.product_uid.astype(int)

# Left join: each row of df_full is repeated once per attribute of its product.
df_all = df_full.merge(att, on = 'product_uid', how = 'left')
df_all.head()

b'Skipping line 903812: expected 3 fields, saw 4\nSkipping line 943376: expected 3 fields, saw 4\n'
b'Skipping line 1660255: expected 3 fields, saw 4\n'


Unnamed: 0,product_uid,product_title,search_term,product_description,value
0,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,"Not only do angles make joints stronger, they ...",Versatile connector for various 90 connections...
1,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,"Not only do angles make joints stronger, they ...",Stronger than angled nailing or screw fastenin...
2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,"Not only do angles make joints stronger, they ...",Help ensure joints are consistently straight a...
3,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,"Not only do angles make joints stronger, they ...",Dimensions: 3 in. x 3 in. x 1-1/2 in.
4,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,"Not only do angles make joints stronger, they ...",Made from 12-Gauge steel


### Creating Inverted Index for Product Attributes

In [75]:
inverted_index_att = {}                                     # term -> (idf, {product_uid: term frequency}); idf is a 0.0 placeholder here
docs_att = {}                                               # product_uid -> number of tokens kept from all of its attribute values
porter = nltk.PorterStemmer()                               # not referenced in this cell (stemming happens inside str_stem)

# Build the stop-word set once. Calling stopwords.words() inside the token
# filter rebuilt the list for every single token and searched it linearly.
stop_words = set(stopwords.words('english'))

# `att` holds one row per attribute, so a product spans several rows. The
# tokens of all its rows are gathered into one document; skipping a product
# once it had been seen indexed only its first attribute value.
tokens_by_doc = {}
for index, row in att.iterrows():
    tokens = [
        w for w in word_tokenize(str_stem(row['value']))
        if w not in stop_words and w != '.' and w != '/'
    ]
    tokens_by_doc.setdefault(row['product_uid'], []).extend(tokens)

for doc_id, tokens in tokens_by_doc.items():
    docs_att[doc_id] = len(tokens)

    # Count every term in one pass instead of calling tokens.count() per token.
    term_freq = defaultdict(int)
    for tok in tokens:
        term_freq[tok] += 1

    for tok, tf in term_freq.items():
        if tok not in inverted_index_att:
            inverted_index_att[tok] = (0.0, {})
        inverted_index_att[tok][1][doc_id] = tf

#### Compute IDFs for each term

In [76]:
# Swap each term's placeholder weight for its IDF: log10 of the corpus size
# divided by the number of attribute documents that contain the term. Only
# existing keys are reassigned, so the dictionary keeps its size while it is
# iterated.
for term, (_, postings) in inverted_index_att.items():
    doc_freq = len(postings)
    inverted_index_att[term] = (math.log10(N / doc_freq), postings)

### Creating DocLen Files for each document type
#### Dictionaries with product_uid as key and the Euclidean length (norm) of the document's tf-idf vector as the value

In [77]:
# Euclidean length of every title's tf-idf vector: sqrt(sum((tf * idf)^2)).
# The postings are walked once and each squared weight is added to its
# document's running total. Scanning the whole vocabulary for every document
# costs documents x terms lookups for the same sums.
squared_sums = dict.fromkeys(docs, 0.0)    # documents without terms stay at 0.0
for idf, postings in inverted_index.values():
    for doc, tf in postings.items():
        if doc in squared_sums:
            squared_sums[doc] += math.pow(tf * idf, 2.0)

DL_title = {doc: math.sqrt(total) for doc, total in squared_sums.items()}

In [78]:
# Euclidean length of every description's tf-idf vector: sqrt(sum((tf * idf)^2)).
# The postings are walked once and each squared weight is added to its
# document's running total. Scanning the whole vocabulary for every document
# costs documents x terms lookups for the same sums.
squared_sums = dict.fromkeys(docs_desc, 0.0)    # documents without terms stay at 0.0
for idf, postings in inverted_index_desc.values():
    for doc, tf in postings.items():
        if doc in squared_sums:
            squared_sums[doc] += math.pow(tf * idf, 2.0)

DL_description = {doc: math.sqrt(total) for doc, total in squared_sums.items()}

In [79]:
# Euclidean length of every attribute document's tf-idf vector: sqrt(sum((tf * idf)^2)).
# The postings are walked once and each squared weight is added to its
# document's running total. Scanning the whole vocabulary for every document
# costs documents x terms lookups for the same sums.
squared_sums = dict.fromkeys(docs_att, 0.0)    # documents without terms stay at 0.0
for idf, postings in inverted_index_att.values():
    for doc, tf in postings.items():
        if doc in squared_sums:
            squared_sums[doc] += math.pow(tf * idf, 2.0)

DL_attribute = {doc: math.sqrt(total) for doc, total in squared_sums.items()}

### Write Product Title Inverted Index to csv

In [80]:
# newline='' hands line-ending control to the csv module, as its
# documentation requires; without it, platforms that translate "\n" write an
# extra carriage return per row.
with open('product_title_inverted_index_revised.csv', 'w', newline='') as csv_file:
    csvwriter = csv.writer(csv_file, delimiter='\t')
    # One row per term in sorted order: term, IDF, postings dict
    # (the dict is written as its str() representation).
    for term in sorted(inverted_index):
        idf, postings = inverted_index[term]
        csvwriter.writerow([term, idf, postings])

### Write Product Description Inverted Index to csv

In [81]:
# newline='' hands line-ending control to the csv module, as its
# documentation requires; without it, platforms that translate "\n" write an
# extra carriage return per row.
with open('product_description_inverted_index_revised.csv', 'w', newline='') as csv_file:
    csvwriter = csv.writer(csv_file, delimiter='\t')
    # One row per term in sorted order: term, IDF, postings dict
    # (the dict is written as its str() representation).
    for term in sorted(inverted_index_desc):
        idf, postings = inverted_index_desc[term]
        csvwriter.writerow([term, idf, postings])

### Write Product Attribute Inverted Index to csv

In [89]:
# newline='' hands line-ending control to the csv module, as its
# documentation requires; without it, platforms that translate "\n" write an
# extra carriage return per row.
with open('product_attribute_inverted_index_revised.csv', 'w', newline='') as csv_file:
    csvwriter = csv.writer(csv_file, delimiter='\t')
    # One row per term in sorted order: term, IDF, postings dict
    # (the dict is written as its str() representation).
    for term in sorted(inverted_index_att):
        idf, postings = inverted_index_att[term]
        csvwriter.writerow([term, idf, postings])

### Write DocumentID -> tf-idf vector length (norm) to csv

In [83]:
# newline='' hands line-ending control to the csv module, as its
# documentation requires.
with open('title_tfidf_revised.csv', 'w', newline='') as csv_file:
    csvwriter = csv.writer(csv_file, delimiter='\t')
    # One row per product: id, value stored in DL_title.
    csvwriter.writerows(DL_title.items())

In [84]:
# newline='' hands line-ending control to the csv module, as its
# documentation requires.
with open('description_tfidf_revised.csv', 'w', newline='') as csv_file:
    csvwriter = csv.writer(csv_file, delimiter='\t')
    # One row per product: id, value stored in DL_description.
    csvwriter.writerows(DL_description.items())

In [85]:
# newline='' hands line-ending control to the csv module, as its
# documentation requires.
with open('attribute_tfidf_revised.csv', 'w', newline='') as csv_file:
    csvwriter = csv.writer(csv_file, delimiter='\t')
    # One row per product: id, value stored in DL_attribute.
    csvwriter.writerows(DL_attribute.items())

### Write DocumentID -> token count (DocLen) files to csv

In [86]:
# newline='' hands line-ending control to the csv module, as its
# documentation requires.
with open('title_doc_index_revised.csv', 'w', newline='') as csv_file:
    csvwriter = csv.writer(csv_file, delimiter='\t')
    # One row per product: id, value stored in docs.
    csvwriter.writerows(docs.items())

In [87]:
# newline='' hands line-ending control to the csv module, as its
# documentation requires.
with open('description_doc_index_revised.csv', 'w', newline='') as csv_file:
    csvwriter = csv.writer(csv_file, delimiter='\t')
    # One row per product: id, value stored in docs_desc.
    csvwriter.writerows(docs_desc.items())

In [88]:
# newline='' hands line-ending control to the csv module, as its
# documentation requires.
with open('attribute_doc_index_revised.csv', 'w', newline='') as csv_file:
    csvwriter = csv.writer(csv_file, delimiter='\t')
    # One row per product: id, value stored in docs_att.
    csvwriter.writerows(docs_att.items())