In [297]:
%%writefile convert-text.py
import pandas as pd
import numpy as np
import pickle as pkl
import argparse as ap

import os
import re
import string
import html

from collections import defaultdict

# Command line interface: three required positionals (input directory, output
# directory, vocabulary pickle) plus optional csv filenames, the indices of
# the columns to convert and a verbosity flag.
parser = ap.ArgumentParser()
parser.add_argument('source_dir',
                    help='directory containing dataset and match files to convert')
parser.add_argument('dest_dir',
                    help='directory to save converted dataset csvs')
# adjacent literals instead of a backslash continuation, so the help string
# no longer carries the source indentation inside it
parser.add_argument('mapping_file',
                    help='double dictionary containing maps to-from words '
                         'and vocabulary indices')
parser.add_argument('--set1', '-s1', default='set1.csv',
                    help='filename of first dataset csv')
parser.add_argument('--set2', '-s2', default='set2.csv',
                    help='filename of second dataset csv')
parser.add_argument('--matches', '-m', default='matches.csv',
                    help='filename of positives matches csv')
# required: without it column_idxs would be None and the script would only
# fail later, inside the pandas indexing calls, with an unrelated error
parser.add_argument('--indices', '-i', nargs='+', type=int, required=True,
                    help='indices of columns to be converted (starting from 0)')
parser.add_argument('--verbose', '-v', action='store_true',
                    help='print statistics')

# parse command line arguments
args = parser.parse_args()
source_dir = args.source_dir
dest_dir = args.dest_dir
mapping_file = args.mapping_file
column_idxs = args.indices

verbose = args.verbose

set1 = args.set1
set2 = args.set2
matches = args.matches

if verbose:
    print('Loading datasets and maps.')

# Read the three csvs in order. df_pos is loaded so that it can be copied to
# the destination directory alongside the two datasets.
df1, df2, df_pos = (
    pd.read_csv(os.path.join(source_dir, filename), encoding="latin1")
    for filename in (set1, set2, matches)
)

# Unpickle the double dictionary.
# NOTE(review): pickle can run arbitrary code while loading; only point
# mapping_file at files you trust.
with open(mapping_file, 'rb') as f:
    map = pkl.load(f)

# Wrap word2idx in a defaultdict so that unknown tokens resolve to index 0
map['word2idx'] = defaultdict(int, map['word2idx'])

# Punctuation marks that are surrounded with spaces so that each one becomes
# its own whitespace-delimited token. Quotes, hyphens and periods are left alone.
_PUNCT_RE = re.compile(r'[()/!#$%&\\*+,:;<=>?@\[\]^_{}|`~]')
# A "v<digits>" prefix followed by one or more ".<digits>" groups.
_VERSION_RE = re.compile(r'(v\d+)(?:\.\d+)+')


def clean_text(x):
    """Format a single cell value as a string ready for whitespace tokenizing.

    Parameters
    ----------
    x : object
        Cell value. Anything that is not a ``str`` (e.g. a float NaN for a
        missing cell) is replaced by the literal string ``'NaN'``.

    Returns
    -------
    str
        The cleaned text: html entities decoded, possessive ``'s`` split off
        with a space, selected punctuation padded with spaces, and version
        strings such as ``v1.2.3`` reduced to ``v1``.
    """
    if not isinstance(x, str):
        return 'NaN'

    # decode html entities first, so that escaped apostrophes (&#39;s) are
    # visible to the possessive step and escaped punctuation gets padded
    x = html.unescape(x)

    # separate possessives with spaces
    x = x.replace("'s", " 's")

    # separate punctuations with spaces
    x = _PUNCT_RE.sub(r' \g<0> ', x)

    # remove every decimal part of a version number, keeping only "v<major>"
    x = _VERSION_RE.sub(r'\1', x)

    return x

# Report which columns are about to be cleaned, one line per dataset
if verbose:
    for banner, frame in (
        ('Cleaning the following columns from set1:', df1),
        ('Cleaning the following columns from set2:', df2),
    ):
        print(banner)
        for column in frame.columns[column_idxs]:
            print(column, end=' ')
        print()

# Run clean_text over every cell of the selected columns and write the
# result back into the same positions
for frame in (df1, df2):
    frame.iloc[:, column_idxs] = frame.iloc[:, column_idxs].applymap(clean_text)

def record2idx(x, word2idx=None):
    """Convert a whitespace-separated string into a list of vocabulary indices.

    Each token is looked up as-is, then lower-cased, then capitalized
    (``string.capwords``), then upper-cased; the first non-zero index wins.
    A token with no match in any form maps to 0.

    Parameters
    ----------
    x : str
        Text to convert; it is split on whitespace.
    word2idx : mapping, optional
        Token -> index mapping. Defaults to the module-level
        ``map['word2idx']``.

    Returns
    -------
    list
        One index per token, in order. Empty list for an empty string.
    """
    if word2idx is None:
        word2idx = map['word2idx']

    # .get never inserts: indexing a defaultdict with [] would add every
    # unknown token (and its case variants) to the vocabulary as a side effect
    lookup = word2idx.get

    indices = []
    for token in x.split():
        for variant in (str, str.lower, string.capwords, str.upper):
            idx = lookup(variant(token), 0)
            if idx != 0:
                break
        indices.append(idx)
    return indices

if verbose:
    print('Converting tokens to indices.')
# replace every cleaned string in the selected columns by its list of indices
df1.iloc[:, column_idxs] = df1.iloc[:, column_idxs].applymap(record2idx)
df2.iloc[:, column_idxs] = df2.iloc[:, column_idxs].applymap(record2idx)

# makedirs (unlike mkdir) also creates missing parent directories, and
# exist_ok avoids a crash if the directory appears between the check and
# the call
if not os.path.isdir(dest_dir):
    os.makedirs(dest_dir, exist_ok=True)
    if verbose:
        print('Creating destination directory')

# write the converted datasets and the matches under their original filenames
df1.to_csv(os.path.join(dest_dir, set1), index=False)
df2.to_csv(os.path.join(dest_dir, set2), index=False)
df_pos.to_csv(os.path.join(dest_dir, matches), index=False)

Overwriting convert-text.py


# Prototyping

In [287]:
# Display the second dataset frame to inspect its columns and values
df2

Unnamed: 0,id2,id,upc,brand,groupname,title,price,shelfdescr,shortdescr,longdescr,imageurl,modelno,shipweight,length,width,height
0,1,14249992,64109216245,Draper,Electronics - General,Draper Infrared Remote Transmitter,58.45,Infrared transmitter. 3-button operation for i...,,DR1143Infrared transmitter. 3-button operation...,http : / / i.walmartimages.com / i / mp / 00...,121066,2.000,,,
1,2,10928662,1034383417,Epson,Monitors,Epson 1500 Hours 200W UHE Projector Lamp ELPLP12,438.84,,Epson ELPLP12 Replacement Lamp,EPSON ELPLP12 1500HRS 200V REPL LAMP FOR LAMP ...,http : / / i.walmartimages.com / i / p / 00 ...,ELPLP12,0.950,6.75,5.75,5.50
2,3,11961447,80844700031,Comprehensive,TV Accessories,Comprehensive Two-Piece 75 Precision BNC Jack ...,59.25,Comprehensive s True 75 connectors eliminate i...,,CH1151Comprehensive s True 75 connectors elimi...,http : / / i.walmartimages.com / i / mp / 00...,BJ-2C7559,0.050,,,
3,4,13044637,79006932160,D-Link,Garden - General,D-Link DCS-1100 Network Camera,99.82,Surveillance Network Camera Built-in Omni-dir...,The D-Link DCS-1100 Network Camera comes with ...,Surveillance Network Camera Built-in Omni-dir...,http : / / i.walmartimages.com / i / p / 00 ...,DCS-1100,,,,
4,5,13214131,6503081691,StarTech,Electronics - General,StarTech.com RKPW247015 24 Outlet Power Strip,59.00,Fits most 42U racks and equipment cabinets 10...,The StarTech.com RKPW247015 24 Outlet Power St...,24 Outlet Power Strip solution for your rackmo...,http : / / i.walmartimages.com / i / p / 00 ...,RKPW247015,5.250,72.50,2.50,1.50
5,6,13215629,2120052175,3M,Furniture,3M FR530CB Ergonomic Adjustable Footrest,67.88,Material Steel Features Non-skid Color Charc...,This adjustable footrest offers an ergonomic d...,Material Steel Features Non-skid Color Charc...,http : / / i.walmartimages.com / i / p / 00 ...,FR530CB,10.500,21.60,13.50,4.05
6,7,13216122,8869859138,HP,Stationery & amp ; Office Machinery,HP C6020B Coated Paper 1 roll 36-inches x 150 ft,49.88,Bright white coated paper For use with Inkjet...,The HP C6020B Coated Paper is ideal for color-...,Bright white coated paper For use with Inkjet...,http : / / i.walmartimages.com / i / p / 00 ...,C6020B,9.780,33.98,3.83,3.83
7,8,13216211,78541410985,Visioneer,Electronics - General,Visioneer RW120-WU RoadWarrior Sheetfed Scanner,199.98,Sheetfed color scanner 600 dpi USB connection,The Visioneer RoadWarrior is the perfect compa...,Sheetfed color scanner Lightweight and portab...,http : / / i.walmartimages.com / i / p / 00 ...,RW120-WU,1.850,11.40,2.00,1.50
8,9,13221597,1250261176,Brother,Stationery & amp ; Office Machinery,Brother DK2113 QL Label Printers Continuous Le...,64.88,Clear label tape Ideal for sign and banner cr...,Make sign and banner creation a breeze. Make t...,Clear label tape Ideal for sign and banner cr...,http : / / i.walmartimages.com / i / p / 00 ...,DK2113,0.550,4.50,4.50,3.60
9,10,13215623,5113580710,3M,Electronics - General,3M MW310LE Gel Mouse Pad,17.88,Mouse Pad Features Antimicrobial non-skid bas...,Mouse pad with wrist rest features 3M Precise ...,Mouse Pad Features Antimicrobial non-skid bas...,http : / / i.walmartimages.com / i / p / 00 ...,MW310LE,0.950,9.63,7.65,0.36


In [289]:
import pandas as pd
import numpy as np
import pickle as pkl
import argparse as ap

import os
import re
import string
import html

from collections import defaultdict

# Hard-coded settings for prototyping; they take the place of the values
# that convert-text.py reads from its command line arguments.
verbose = True

# csv filenames of the two datasets and of the matches
set1, set2, matches = 'set1.csv', 'set2.csv', 'matches.csv'

# input directory, output directory and pickled vocabulary mapping
source_dir = '../data/raw/amazon-walmart'
dest_dir = '../data/converted/amazon-walmart'
mapping_file = '../data/embeddings/glove-300.map'

# positions (starting from 0) of the columns to convert
column_idxs = [3, 4, 5, 7, 8, 9, 10]

In [290]:
if verbose:
    print('Loading datasets and maps.')

# Read the three csvs in order. df_pos is loaded so that it can be copied to
# the destination directory alongside the two datasets.
df1, df2, df_pos = (
    pd.read_csv(os.path.join(source_dir, filename), encoding="latin1")
    for filename in (set1, set2, matches)
)

# Unpickle the double dictionary.
# NOTE(review): pickle can run arbitrary code while loading; only point
# mapping_file at files you trust.
with open(mapping_file, 'rb') as f:
    map = pkl.load(f)

# Wrap word2idx in a defaultdict so that unknown tokens resolve to index 0
map['word2idx'] = defaultdict(int, map['word2idx'])

Loading datasets and maps.


In [291]:
# Punctuation marks that are surrounded with spaces so that each one becomes
# its own whitespace-delimited token. Quotes, hyphens and periods are left alone.
_PUNCT_RE = re.compile(r'[()/!#$%&\\*+,:;<=>?@\[\]^_{}|`~]')
# A "v<digits>" prefix followed by one or more ".<digits>" groups.
_VERSION_RE = re.compile(r'(v\d+)(?:\.\d+)+')


def clean_text(x):
    """Format a single cell value as a string ready for whitespace tokenizing.

    Parameters
    ----------
    x : object
        Cell value. Anything that is not a ``str`` (e.g. a float NaN for a
        missing cell) is replaced by the literal string ``'NaN'``.

    Returns
    -------
    str
        The cleaned text: html entities decoded, possessive ``'s`` split off
        with a space, selected punctuation padded with spaces, and version
        strings such as ``v1.2.3`` reduced to ``v1``.
    """
    if not isinstance(x, str):
        return 'NaN'

    # decode html entities first, so that escaped apostrophes (&#39;s) are
    # visible to the possessive step and escaped punctuation gets padded
    x = html.unescape(x)

    # separate possessives with spaces
    x = x.replace("'s", " 's")

    # separate punctuations with spaces
    x = _PUNCT_RE.sub(r' \g<0> ', x)

    # remove every decimal part of a version number, keeping only "v<major>"
    x = _VERSION_RE.sub(r'\1', x)

    return x

# Report which columns are about to be cleaned, one line per dataset
if verbose:
    for banner, frame in (
        ('Cleaning the following columns from set1:', df1),
        ('Cleaning the following columns from set2:', df2),
    ):
        print(banner)
        for column in frame.columns[column_idxs]:
            print(column, end=' ')
        print()

# Run clean_text over every cell of the selected columns and write the
# result back into the same positions
for frame in (df1, df2):
    frame.iloc[:, column_idxs] = frame.iloc[:, column_idxs].applymap(clean_text)

Cleaning the following columns from set1:
brand pcategory1 title techdetails proddescrshort proddescrlong imageurl 
Cleaning the following columns from set2:
brand groupname title shelfdescr shortdescr longdescr imageurl 


In [None]:
def record2idx(x, word2idx=None):
    """Convert a whitespace-separated string into a list of vocabulary indices.

    Each token is looked up as-is, then lower-cased, then capitalized
    (``string.capwords``), then upper-cased; the first non-zero index wins.
    A token with no match in any form maps to 0.

    Parameters
    ----------
    x : str
        Text to convert; it is split on whitespace.
    word2idx : mapping, optional
        Token -> index mapping. Defaults to the module-level
        ``map['word2idx']``.

    Returns
    -------
    list
        One index per token, in order. Empty list for an empty string.
    """
    if word2idx is None:
        word2idx = map['word2idx']

    # .get never inserts: indexing a defaultdict with [] would add every
    # unknown token (and its case variants) to the vocabulary as a side effect
    lookup = word2idx.get

    indices = []
    for token in x.split():
        for variant in (str, str.lower, string.capwords, str.upper):
            idx = lookup(variant(token), 0)
            if idx != 0:
                break
        indices.append(idx)
    return indices

if verbose:
    print('Converting tokens to indices.')
# replace every cleaned string in the selected columns by its list of indices
df1.iloc[:, column_idxs] = df1.iloc[:, column_idxs].applymap(record2idx)
df2.iloc[:, column_idxs] = df2.iloc[:, column_idxs].applymap(record2idx)

# makedirs (unlike mkdir) also creates missing parent directories, and
# exist_ok avoids a crash if the directory appears between the check and
# the call
if not os.path.isdir(dest_dir):
    os.makedirs(dest_dir, exist_ok=True)
    if verbose:
        print('Creating destination directory')

In [264]:
    
# Write each frame into dest_dir under the filename it was loaded from,
# without the row index
for frame, filename in ((df1, set1), (df2, set2), (df_pos, matches)):
    frame.to_csv(os.path.join(dest_dir, filename), index=False)

Overwriting convert-text.py
