In [19]:
import subprocess
import pandas as pd
import fasttext

In [20]:
retail_data = pd.read_csv("train.csv")
retail_data.head()

Unnamed: 0,ImgId,title,description,categories
0,B000HYL1V6,TUNGSTEN SOLDER PICK WITH HANDLE,Solder Pick for picking up molten solder when ...,"Arts, Crafts & Sewing"
1,B00006HXWY,Write Right 98167 Screen Protector for Sony T615C,We all screen. And we all need to protect thos...,Cell Phones & Accessories
2,B000GAWSBS,Casio Mens DBC310-1 Databank 300 Digital Watch...,"Bringing you precision at a glance, the Casio ...","Clothing, Shoes & Jewelry"
3,B000040JOL,Factory-Reconditioned DEWALT DW260KR Heavy-Dut...,Factory-Reconditioned DEWALT DW260KR Heavy-Dut...,Tools & Home Improvement
4,B00006IB78,Energizer 2 in 1 Light,This twoway light features a bright flashlight...,Health & Personal Care


In [21]:
# Delete the ImdId column
img_id_col = retail_data.columns[0]
retail_data_processed = retail_data.drop([img_id_col], axis=1)
retail_data_processed.head()

Unnamed: 0,title,description,categories
0,TUNGSTEN SOLDER PICK WITH HANDLE,Solder Pick for picking up molten solder when ...,"Arts, Crafts & Sewing"
1,Write Right 98167 Screen Protector for Sony T615C,We all screen. And we all need to protect thos...,Cell Phones & Accessories
2,Casio Mens DBC310-1 Databank 300 Digital Watch...,"Bringing you precision at a glance, the Casio ...","Clothing, Shoes & Jewelry"
3,Factory-Reconditioned DEWALT DW260KR Heavy-Dut...,Factory-Reconditioned DEWALT DW260KR Heavy-Dut...,Tools & Home Improvement
4,Energizer 2 in 1 Light,This twoway light features a bright flashlight...,Health & Personal Care


In [22]:
# Make the categories column the first column
cols = retail_data_processed.columns.tolist()
cols = cols[-1:] + cols[:-1]
retail_data_processed = retail_data_processed[cols]
retail_data_processed.head()

Unnamed: 0,categories,title,description
0,"Arts, Crafts & Sewing",TUNGSTEN SOLDER PICK WITH HANDLE,Solder Pick for picking up molten solder when ...
1,Cell Phones & Accessories,Write Right 98167 Screen Protector for Sony T615C,We all screen. And we all need to protect thos...
2,"Clothing, Shoes & Jewelry",Casio Mens DBC310-1 Databank 300 Digital Watch...,"Bringing you precision at a glance, the Casio ..."
3,Tools & Home Improvement,Factory-Reconditioned DEWALT DW260KR Heavy-Dut...,Factory-Reconditioned DEWALT DW260KR Heavy-Dut...
4,Health & Personal Care,Energizer 2 in 1 Light,This twoway light features a bright flashlight...


In [23]:
# Replace the spaces in the first column with underscores
retail_data_processed["categories"] = retail_data_processed["categories"].str.replace(" ", "-")
retail_data_processed["categories"] = retail_data_processed["categories"].str.replace(",", "")
retail_data_processed["categories"] 

0             Arts-Crafts-&-Sewing
1        Cell-Phones-&-Accessories
2         Clothing-Shoes-&-Jewelry
3         Tools-&-Home-Improvement
4           Health-&-Personal-Care
                   ...            
46224            Sports-&-Outdoors
46225         Arts-Crafts-&-Sewing
46226         Arts-Crafts-&-Sewing
46227       Grocery-&-Gourmet-Food
46228         Arts-Crafts-&-Sewing
Name: categories, Length: 46229, dtype: object

In [24]:
# Save the processed .csv file
retail_data_processed.to_csv("train.processed.csv", index=False)

In [25]:
# Normalizing the text
process = subprocess.run("""
    cat /home/agastya/Programming/retail/train.processed.csv | \
    tr '[:upper:]' '[:lower:]' | \
    sed -e 's/^/__label__/g' | \
    sed \
        -e "s/'/ ' /g" -e 's/"//g' \
        -e 's/\./ \. /g' \
        -e 's/<br \/>/ /g' \
        -e 's/,/ , /g' \
        -e 's/(/ ( /g' \
        -e 's/)/ ) /g' \
        -e 's/\!/ \! /g' \
        -e 's/\?/ \? /g' \
        -e 's/\;/ /g' \
        -e 's/\:/ /g' | tr -s " " | \
    perl -MList::Util=shuffle \
        -e 'print shuffle(<>);' "$@";
""", capture_output=True, shell=True)

output = process.stdout.decode("utf-8")

In [26]:
# Save the output to a file
with open("retail.train", "w") as training_file:
    training_file.write(output)

In [27]:
# Create the fasttext model and save it
model = fasttext.train_supervised(input="retail.train")
model.save_model("retail.bin")

Read 1M wordsRead 2M wordsRead 3M wordsRead 4M words

Read 4M words
Number of words:  151371
Number of labels: 1991
Progress:   0.8% words/sec/thread:  119466 lr:  0.099223 avg.loss: 17.261974 ETA:   0h 0m12s

Progress:   1.5% words/sec/thread:  114597 lr:  0.098514 avg.loss: 11.385130 ETA:   0h 0m13sProgress:   2.3% words/sec/thread:  118581 lr:  0.097696 avg.loss:  9.173843 ETA:   0h 0m12s

Progress:   3.1% words/sec/thread:  118097 lr:  0.096942 avg.loss:  8.113221 ETA:   0h 0m12sProgress:   3.7% words/sec/thread:  115031 lr:  0.096278 avg.loss:  7.477741 ETA:   0h 0m12sProgress:   4.5% words/sec/thread:  116613 lr:  0.095473 avg.loss:  7.096852 ETA:   0h 0m12sProgress:   5.4% words/sec/thread:  118462 lr:  0.094636 avg.loss:  6.620281 ETA:   0h 0m12s

Progress:   6.2% words/sec/thread:  119786 lr:  0.093802 avg.loss:  6.291380 ETA:   0h 0m12sProgress:   7.0% words/sec/thread:  120282 lr:  0.092999 avg.loss:  5.980684 ETA:   0h 0m11sProgress:   7.8% words/sec/thread:  121102 lr:  0.092169 avg.loss:  5.843977 ETA:   0h 0m11s

Progress:   8.7% words/sec/thread:  122783 lr:  0.091267 avg.loss:  5.684089 ETA:   0h 0m11sProgress:   9.5% words/sec/thread:  123075 lr:  0.090451 avg.loss:  5.562494 ETA:   0h 0m11sProgress:  10.4% words/sec/thread:  124253 lr:  0.089556 avg.loss:  5.497750 ETA:   0h 0m11s

Progress:  11.3% words/sec/thread:  124654 lr:  0.088717 avg.loss:  5.390522 ETA:   0h 0m11sProgress:  12.1% words/sec/thread:  125246 lr:  0.087854 avg.loss:  5.281600 ETA:   0h 0m10sProgress:  13.0% words/sec/thread:  125435 lr:  0.087026 avg.loss:  5.194226 ETA:   0h 0m10s

Progress:  13.8% words/sec/thread:  125899 lr:  0.086164 avg.loss:  5.108814 ETA:   0h 0m10sProgress:  14.7% words/sec/thread:  126334 lr:  0.085300 avg.loss:  5.025107 ETA:   0h 0m10sProgress:  15.6% words/sec/thread:  126655 lr:  0.084444 avg.loss:  4.963343 ETA:   0h 0m10s

Progress:  16.4% words/sec/thread:  126793 lr:  0.083608 avg.loss:  4.933255 ETA:   0h 0m10sProgress:  17.2% words/sec/thread:  126754 lr:  0.082793 avg.loss:  4.870368 ETA:   0h 0m10sProgress:  18.0% words/sec/thread:  126883 lr:  0.081956 avg.loss:  4.843504 ETA:   0h 0m10s

Progress:  18.9% words/sec/thread:  127237 lr:  0.081081 avg.loss:  4.795201 ETA:   0h 0m 9sProgress:  19.7% words/sec/thread:  127171 lr:  0.080268 avg.loss:  4.760939 ETA:   0h 0m 9sProgress:  20.5% words/sec/thread:  127085 lr:  0.079461 avg.loss:  4.718992 ETA:   0h 0m 9s

Progress:  21.4% words/sec/thread:  127206 lr:  0.078619 avg.loss:  4.675697 ETA:   0h 0m 9sProgress:  22.2% words/sec/thread:  127394 lr:  0.077764 avg.loss:  4.619425 ETA:   0h 0m 9sProgress:  23.1% words/sec/thread:  127573 lr:  0.076910 avg.loss:  4.572955 ETA:   0h 0m 9s

Progress:  24.0% words/sec/thread:  127974 lr:  0.076009 avg.loss:  4.532355 ETA:   0h 0m 9sProgress:  24.8% words/sec/thread:  128059 lr:  0.075166 avg.loss:  4.474617 ETA:   0h 0m 9sProgress:  25.6% words/sec/thread:  127921 lr:  0.074366 avg.loss:  4.433192 ETA:   0h 0m 9s

Progress:  26.4% words/sec/thread:  127773 lr:  0.073570 avg.loss:  4.383660 ETA:   0h 0m 8sProgress:  27.3% words/sec/thread:  127841 lr:  0.072730 avg.loss:  4.339062 ETA:   0h 0m 8s

Progress:  28.1% words/sec/thread:  127817 lr:  0.071908 avg.loss:  4.294326 ETA:   0h 0m 8sProgress:  28.9% words/sec/thread:  127930 lr:  0.071056 avg.loss:  4.262712 ETA:   0h 0m 8sProgress:  29.7% words/sec/thread:  127801 lr:  0.070260 avg.loss:  4.215266 ETA:   0h 0m 8s

Progress:  30.6% words/sec/thread:  127848 lr:  0.069423 avg.loss:  4.197045 ETA:   0h 0m 8sProgress:  31.4% words/sec/thread:  127951 lr:  0.068572 avg.loss:  4.180823 ETA:   0h 0m 8sProgress:  32.3% words/sec/thread:  127943 lr:  0.067747 avg.loss:  4.146604 ETA:   0h 0m 8s

Progress:  33.2% words/sec/thread:  128282 lr:  0.066833 avg.loss:  4.120266 ETA:   0h 0m 8sProgress:  34.0% words/sec/thread:  128353 lr:  0.065985 avg.loss:  4.094366 ETA:   0h 0m 7sProgress:  34.9% words/sec/thread:  128381 lr:  0.065148 avg.loss:  4.065730 ETA:   0h 0m 7s

Progress:  35.7% words/sec/thread:  128402 lr:  0.064312 avg.loss:  4.035918 ETA:   0h 0m 7sProgress:  36.5% words/sec/thread:  128484 lr:  0.063460 avg.loss:  4.024301 ETA:   0h 0m 7sProgress:  37.4% words/sec/thread:  128469 lr:  0.062634 avg.loss:  4.009199 ETA:   0h 0m 7s

Progress:  38.2% words/sec/thread:  128568 lr:  0.061755 avg.loss:  3.987891 ETA:   0h 0m 7sProgress:  39.1% words/sec/thread:  128604 lr:  0.060913 avg.loss:  3.970962 ETA:   0h 0m 7s

Progress:  39.9% words/sec/thread:  128657 lr:  0.060066 avg.loss:  3.942963 ETA:   0h 0m 7sProgress:  40.8% words/sec/thread:  128612 lr:  0.059201 avg.loss:  3.914120 ETA:   0h 0m 7sProgress:  41.6% words/sec/thread:  128658 lr:  0.058355 avg.loss:  3.900165 ETA:   0h 0m 7sProgress:  42.6% words/sec/thread:  128904 lr:  0.057443 avg.loss:  3.884456 ETA:   0h 0m 6s

Progress:  43.4% words/sec/thread:  128942 lr:  0.056598 avg.loss:  3.861398 ETA:   0h 0m 6sProgress:  44.3% words/sec/thread:  129028 lr:  0.055734 avg.loss:  3.839608 ETA:   0h 0m 6sProgress:  45.1% words/sec/thread:  129044 lr:  0.054895 avg.loss:  3.823220 ETA:   0h 0m 6s

Progress:  46.0% words/sec/thread:  129144 lr:  0.054025 avg.loss:  3.796738 ETA:   0h 0m 6sProgress:  46.7% words/sec/thread:  128982 lr:  0.053250 avg.loss:  3.790849 ETA:   0h 0m 6sProgress:  47.6% words/sec/thread:  129033 lr:  0.052398 avg.loss:  3.772069 ETA:   0h 0m 6s

Progress:  48.5% words/sec/thread:  129196 lr:  0.051503 avg.loss:  3.738468 ETA:   0h 0m 6sProgress:  49.3% words/sec/thread:  129189 lr:  0.050671 avg.loss:  3.723294 ETA:   0h 0m 6sProgress:  50.2% words/sec/thread:  129262 lr:  0.049808 avg.loss:  3.702379 ETA:   0h 0m 5s

Progress:  51.0% words/sec/thread:  129284 lr:  0.048964 avg.loss:  3.684172 ETA:   0h 0m 5sProgress:  51.9% words/sec/thread:  129307 lr:  0.048119 avg.loss:  3.657704 ETA:   0h 0m 5sProgress:  52.7% words/sec/thread:  129320 lr:  0.047278 avg.loss:  3.641035 ETA:   0h 0m 5s

Progress:  53.6% words/sec/thread:  129475 lr:  0.046379 avg.loss:  3.612195 ETA:   0h 0m 5sProgress:  54.5% words/sec/thread:  129466 lr:  0.045546 avg.loss:  3.598679 ETA:   0h 0m 5sProgress:  55.3% words/sec/thread:  129476 lr:  0.044706 avg.loss:  3.587256 ETA:   0h 0m 5s

Progress:  56.1% words/sec/thread:  129466 lr:  0.043873 avg.loss:  3.561169 ETA:   0h 0m 5sProgress:  57.0% words/sec/thread:  129473 lr:  0.043033 avg.loss:  3.547189 ETA:   0h 0m 5sProgress:  57.8% words/sec/thread:  129432 lr:  0.042215 avg.loss:  3.528575 ETA:   0h 0m 5s

Progress:  58.6% words/sec/thread:  129434 lr:  0.041378 avg.loss:  3.518311 ETA:   0h 0m 4sProgress:  59.5% words/sec/thread:  129404 lr:  0.040515 avg.loss:  3.501824 ETA:   0h 0m 4s

Progress:  60.3% words/sec/thread:  129308 lr:  0.039722 avg.loss:  3.485744 ETA:   0h 0m 4sProgress:  61.2% words/sec/thread:  129465 lr:  0.038812 avg.loss:  3.470934 ETA:   0h 0m 4sProgress:  62.0% words/sec/thread:  129448 lr:  0.037984 avg.loss:  3.450930 ETA:   0h 0m 4s

Progress:  62.9% words/sec/thread:  129606 lr:  0.037071 avg.loss:  3.430592 ETA:   0h 0m 4sProgress:  63.8% words/sec/thread:  129634 lr:  0.036220 avg.loss:  3.412835 ETA:   0h 0m 4sProgress:  64.6% words/sec/thread:  129696 lr:  0.035352 avg.loss:  3.401634 ETA:   0h 0m 4s

Progress:  65.5% words/sec/thread:  129694 lr:  0.034514 avg.loss:  3.387097 ETA:   0h 0m 4sProgress:  66.3% words/sec/thread:  129609 lr:  0.033719 avg.loss:  3.369549 ETA:   0h 0m 4sProgress:  67.1% words/sec/thread:  129631 lr:  0.032871 avg.loss:  3.357184 ETA:   0h 0m 3s

Progress:  68.0% words/sec/thread:  129679 lr:  0.032008 avg.loss:  3.340980 ETA:   0h 0m 3sProgress:  68.8% words/sec/thread:  129709 lr:  0.031155 avg.loss:  3.333533 ETA:   0h 0m 3sProgress:  69.7% words/sec/thread:  129749 lr:  0.030295 avg.loss:  3.326636 ETA:   0h 0m 3s

Progress:  70.5% words/sec/thread:  129742 lr:  0.029460 avg.loss:  3.309011 ETA:   0h 0m 3sProgress:  71.4% words/sec/thread:  129793 lr:  0.028593 avg.loss:  3.293938 ETA:   0h 0m 3sProgress:  72.2% words/sec/thread:  129769 lr:  0.027768 avg.loss:  3.285998 ETA:   0h 0m 3s

Progress:  73.1% words/sec/thread:  129802 lr:  0.026911 avg.loss:  3.272156 ETA:   0h 0m 3sProgress:  74.0% words/sec/thread:  129884 lr:  0.026024 avg.loss:  3.255350 ETA:   0h 0m 3sProgress:  74.9% words/sec/thread:  130012 lr:  0.025113 avg.loss:  3.244059 ETA:   0h 0m 2s

Progress:  75.7% words/sec/thread:  129960 lr:  0.024302 avg.loss:  3.231687 ETA:   0h 0m 2sProgress:  76.4% words/sec/thread:  129807 lr:  0.023552 avg.loss:  3.220623 ETA:   0h 0m 2sProgress:  77.3% words/sec/thread:  129818 lr:  0.022707 avg.loss:  3.206919 ETA:   0h 0m 2s

Progress:  78.1% words/sec/thread:  129757 lr:  0.021905 avg.loss:  3.194349 ETA:   0h 0m 2sProgress:  79.0% words/sec/thread:  129787 lr:  0.021049 avg.loss:  3.182158 ETA:   0h 0m 2sProgress:  79.8% words/sec/thread:  129814 lr:  0.020193 avg.loss:  3.173255 ETA:   0h 0m 2s

Progress:  80.6% words/sec/thread:  129796 lr:  0.019364 avg.loss:  3.166542 ETA:   0h 0m 2sProgress:  81.5% words/sec/thread:  129815 lr:  0.018514 avg.loss:  3.158043 ETA:   0h 0m 2sProgress:  82.3% words/sec/thread:  129833 lr:  0.017664 avg.loss:  3.154111 ETA:   0h 0m 2s

Progress:  83.2% words/sec/thread:  129873 lr:  0.016799 avg.loss:  3.145162 ETA:   0h 0m 2sProgress:  84.1% words/sec/thread:  129908 lr:  0.015937 avg.loss:  3.137632 ETA:   0h 0m 1s

Progress:  84.9% words/sec/thread:  129864 lr:  0.015126 avg.loss:  3.129253 ETA:   0h 0m 1sProgress:  85.7% words/sec/thread:  129825 lr:  0.014313 avg.loss:  3.123075 ETA:   0h 0m 1sProgress:  86.5% words/sec/thread:  129817 lr:  0.013480 avg.loss:  3.119539 ETA:   0h 0m 1sProgress:  87.4% words/sec/thread:  129835 lr:  0.012628 avg.loss:  3.111059 ETA:   0h 0m 1s

Progress:  88.2% words/sec/thread:  129799 lr:  0.011813 avg.loss:  3.100873 ETA:   0h 0m 1sProgress:  89.1% words/sec/thread:  129853 lr:  0.010938 avg.loss:  3.095770 ETA:   0h 0m 1s

Progress:  89.9% words/sec/thread:  129862 lr:  0.010093 avg.loss:  3.092195 ETA:   0h 0m 1sProgress:  90.7% words/sec/thread:  129863 lr:  0.009252 avg.loss:  3.080538 ETA:   0h 0m 1sProgress:  91.6% words/sec/thread:  129922 lr:  0.008372 avg.loss:  3.070590 ETA:   0h 0m 0sProgress:  92.5% words/sec/thread:  129942 lr:  0.007518 avg.loss:  3.057617 ETA:   0h 0m 0s

Progress:  93.4% words/sec/thread:  129989 lr:  0.006578 avg.loss:  3.047613 ETA:   0h 0m 0sProgress:  94.3% words/sec/thread:  129998 lr:  0.005731 avg.loss:  3.039155 ETA:   0h 0m 0sProgress:  95.1% words/sec/thread:  130024 lr:  0.004872 avg.loss:  3.032368 ETA:   0h 0m 0s

Progress:  96.0% words/sec/thread:  130015 lr:  0.004038 avg.loss:  3.027903 ETA:   0h 0m 0sProgress:  96.8% words/sec/thread:  130030 lr:  0.003187 avg.loss:  3.021662 ETA:   0h 0m 0sProgress:  97.6% words/sec/thread:  130012 lr:  0.002361 avg.loss:  3.012515 ETA:   0h 0m 0s

Progress:  98.5% words/sec/thread:  130042 lr:  0.001497 avg.loss:  3.000929 ETA:   0h 0m 0sProgress:  99.4% words/sec/thread:  130059 lr:  0.000644 avg.loss:  2.995142 ETA:   0h 0m 0sProgress: 

100.0% words/sec/thread:  129811 lr: -0.000006 avg.loss:  2.990862 ETA:   0h 0m 0sProgress: 100.0% words/sec/thread:  129810 lr:  0.000000 avg.loss:  2.990862 ETA:   0h 0m 0s


In [28]:
model.predict('TV')

(('__label__',), array([0.83487737]))