In [1]:
import pandas as pd
import csv
import shutil
import os

# Summary

This notebook demonstrates some basic data wrangling for testing purposes.
We need to create test fixtures to test our indexer.
We have a CSV file containing ~150k wine reviews (https://www.kaggle.com/zynicide/wine-reviews).

We want to create a separate file for each review, grabbing only the "id" and "description" in this
case to provide a simple example.

- The first line of the file will contain the id (we will use it as the document id in our index).

- The second line of the file will contain the review.

In [2]:
# Path to the wine-reviews CSV, relative to this notebook's working directory.
WINE_REVIEWS = '../Javelin.Tests/TestFixtures/winemag-data_first150k.csv'
# Load the whole CSV into a DataFrame; later cells read and rename columns on `df`.
df = pd.read_csv(WINE_REVIEWS)

In [3]:
# Preview the first five rows to see which columns the CSV provides.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [4]:
# The DataFrame index could serve as the document id, but we stay true to
# the id column provided in the file instead.
# Note that this choice is arbitrary - we have no idea what's in the file,
# and duplicate values in that column may cause issues during indexing.

df = df.rename(columns={df.columns[0]: "id"})

In [5]:
# Show only the two columns that end up in the fixture files.
df.loc[:, ['id', 'description']].head()

Unnamed: 0,id,description
0,0,This tremendous 100% varietal wine hails from ...
1,1,"Ripe aromas of fig, blackberry and cassis are ..."
2,2,Mac Watson honors the memory of a wine once ma...
3,3,"This spent 20 months in 30% new French oak, an..."
4,4,"This is the top wine from La Bégude, named aft..."


In [6]:
# Number of reviews (rows) loaded from the CSV.
df.shape[0]

150930

In [7]:
# From this dataset for testing, our search engine will index ~150k documents.

In [9]:
# Write one fixture file per review for every id up to MAX_REVIEW_ID.
# Each file is named after the zero-padded id and holds the review text.
#
# NOTE(review): the summary at the top of this notebook says the first line of
# each file contains the id and the second line the review, but only the
# description is written here (the id is carried by the filename). Behaviour is
# left as-is; confirm which layout the indexer tests expect.
FIXTURE_DIR = '../Javelin.Tests/TestFixtures/Data/'
MAX_REVIEW_ID = 10_000

# Make the cell runnable on a fresh checkout where the directory is missing.
os.makedirs(FIXTURE_DIR, exist_ok=True)

# Select by id rather than breaking out of the loop at the first large id,
# so the result does not depend on the rows being sorted by id.
reviews = df.loc[df['id'] <= MAX_REVIEW_ID, ['id', 'description']]

for review_id, description in zip(reviews['id'], reviews['description']):
    filename = os.path.join(FIXTURE_DIR, str(review_id).zfill(6) + '.txt')
    # Descriptions contain non-ASCII characters; pin the encoding so the
    # fixtures are identical regardless of the platform's default.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(description)

In [10]:
def get_size(start_path):
    """Return the combined size in bytes of the files found under start_path.

    The tree is traversed with os.walk; entries that are symbolic links are
    left out of the total.
    """
    return sum(
        os.path.getsize(candidate)
        for root, _subdirs, names in os.walk(start_path)
        for candidate in (os.path.join(root, name) for name in names)
        if not os.path.islink(candidate)
    )

In [11]:
# Total on-disk size, in bytes, of the files under the fixture Data directory.
size_bytes = get_size('../Javelin.Tests/TestFixtures/Data/')

In [12]:
import math

def convert_size(size_bytes):
    """Format a byte count as a human-readable string using 1024-based units.

    Args:
        size_bytes: Non-negative number of bytes.

    Returns:
        "0B" when size_bytes is zero; otherwise "<value> <unit>", where value
        is rounded to two decimals and unit is one of B, KB, MB, ... YB.
        Sizes beyond the largest unit are expressed in YB.

    Raises:
        ValueError: If size_bytes is negative.
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError("size_bytes must be non-negative, got %r" % (size_bytes,))
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    # Step through the units by repeated division instead of a floating-point
    # logarithm: the index can then never be negative (sizes below 1 byte) or
    # run past the last unit (sizes of 1024**9 and above).
    value = float(size_bytes)
    i = 0
    while value >= 1024 and i < len(size_name) - 1:
        value /= 1024
        i += 1
    return "%s %s" % (round(value, 2), size_name[i])

In [13]:
# Display the fixture directory size in human-readable form.
convert_size(size_bytes)

'2.32 MB'

In [14]:
# Zip the contents of the fixture Data directory into Data.zip alongside it.
shutil.make_archive(
    base_name='../Javelin.Tests/TestFixtures/Data',
    format='zip',
    root_dir='../Javelin.Tests/TestFixtures/Data',
)

'/Users/wes/projects/Javelin.NET/Javelin.Tests/TestFixtures/Data.zip'