In [1]:
import os
import pandas as pd
from pdfquery import PDFQuery
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import SnowballStemmer
import string

# Directory holding the input PDF and all generated files: <current working dir>/data.
# Built with os.path.join so the separator matches the host OS instead of
# being hard-coded to the Windows backslash.
inputDir = os.path.join(os.getcwd(), 'data')

In [2]:
# Load pdf to memory.
# File paths are assembled with os.path.join rather than a literal '\\' so
# the cell is not tied to the Windows path separator.
pdf = PDFQuery(os.path.join(inputDir, 'FYP-CoffeeAttribute.pdf'))
pdf.load()

# Output simplified pdf structure: write the parsed element tree as
# pretty-printed XML so the element names and attributes that the later
# selectors rely on can be inspected by hand.
pdf.tree.write(os.path.join(inputDir, 'FYP-CoffeeAttribute-XML.txt'), pretty_print=True)

In [3]:
# Extract section: pick every horizontal text box that sits on the pages
# whose pageid is '4' or '5'.
section_selector = (
    "LTPage[pageid='4'] LTTextBoxHorizontal, "
    "LTPage[pageid='5'] LTTextBoxHorizontal"
)
sectionList = pdf.pq(section_selector)

# Quick sanity check: number of matched boxes and the first match.
len(sectionList), sectionList[0]

(20, <Element LTTextBoxHorizontal at 0x24252fae530>)

In [4]:
# Extract data
# Every text box is broken into its text lines. A box whose first line has
# the heading height opens a new category, and its remaining lines are that
# category's tags. Any other box is treated as a continuation of the most
# recently opened category.

# Value of the 'height' attribute (compared as a string, exactly as it is
# stored in the parsed tree) that identifies a category heading line.
CATEGORY_LINE_HEIGHT = '16.455'

categoryList = []
tagsList = []

for box in sectionList:
    section = box.cssselect('LTTextLineHorizontal')

    # A box without any text line has nothing to contribute. This replaces
    # the former bare `except:` around section[0], which also hid every
    # unrelated error.
    if not section:
        continue

    category = section[0]

    if category.attrib.get('height') == CATEGORY_LINE_HEIGHT:
        categoryList.append(category.text)
        tagsList.append([node.text for node in section[1:]])
    elif tagsList:
        tagsList[-1].extend(node.text for node in section)
    # Otherwise the box precedes the first category heading: there is no
    # category to attach it to, so it is skipped instead of raising
    # IndexError on tagsList[-1].

# The two lists are built in lockstep, so both lengths should match.
len(categoryList), len(tagsList)

(17, 17)

In [5]:
# Put the parallel category / tag lists side by side in a two-column table:
# one row per category, with the row's tags kept as a list.
column_data = {"Category": categoryList, "Tags": tagsList}
df = pd.DataFrame(column_data)
df.head()

Unnamed: 0,Category,Tags
0,Taste Basics ....................................,"[Sweet , Sour , Bitter , Salty ]"
1,Alcohol/Fermented ...............................,"[Alcohol , Whiskey , Winey , Fermented , Overr..."
2,Fruity ..........................................,"[Fruity , Berry , Strawberry , Raspberry , Blu..."
3,Sour/Acid .......................................,"[Sour , Sour Aromatics , Acetic acid , Butyric..."
4,Green/Vegetative ................................,"[Olive Oil , Raw , Under–ripe , Peapod , Green..."


In [6]:
# Clean the category headings: remove the leader dots and any digits
# (presumably the page numbers that follow the dots), then trim what is left
# at the end of the string.
charList = ['.'] + [str(i) for i in range(10)]
df.Category = (
    df.Category
    .apply(lambda text: ''.join(ch for ch in text if ch not in charList))
    # rstrip() rather than a fixed [:-2] slice: the slice assumed exactly two
    # trailing junk characters and chopped real letters whenever that did not
    # hold, including every time this cell was executed a second time.
    .str.rstrip()
)
df.head()

Unnamed: 0,Category,Tags
0,Taste Basics,"[Sweet , Sour , Bitter , Salty ]"
1,Alcohol/Fermented,"[Alcohol , Whiskey , Winey , Fermented , Overr..."
2,Fruity,"[Fruity , Berry , Strawberry , Raspberry , Blu..."
3,Sour/Acid,"[Sour , Sour Aromatics , Acetic acid , Butyric..."
4,Green/Vegetative,"[Olive Oil , Raw , Under–ripe , Peapod , Green..."


In [7]:
def _tokenize_tags(tags):
    """Join one row's tag strings with spaces and split the result into tokens."""
    return wordpunct_tokenize(' '.join(tags))


# Replace each row's list of raw tag strings with a flat list of tokens.
df.Tags = df.Tags.apply(_tokenize_tags)
df.head()

Unnamed: 0,Category,Tags
0,Taste Basics,"[Sweet, Sour, Bitter, Salty]"
1,Alcohol/Fermented,"[Alcohol, Whiskey, Winey, Fermented, Overripe,..."
2,Fruity,"[Fruity, Berry, Strawberry, Raspberry, Blueber..."
3,Sour/Acid,"[Sour, Sour, Aromatics, Acetic, acid, Butyric,..."
4,Green/Vegetative,"[Olive, Oil, Raw, Under, –, ripe, Peapod, Gree..."


In [8]:
# Drop tokens that consist only of punctuation (ASCII punctuation plus the
# en dash that appears in the tags).
charList = string.punctuation + '–'
punctuation_chars = set(charList)

# Each token is checked character by character. Testing `token not in charList`
# against the string itself is a substring test, so a run of several
# punctuation marks (for example '...' or '),') is kept unless it happens to
# appear verbatim inside charList.
df.Tags = df.Tags.apply(
    lambda tokens: [
        token for token in tokens
        if not all(ch in punctuation_chars for ch in token)
    ]
)
df.head()

Unnamed: 0,Category,Tags
0,Taste Basics,"[Sweet, Sour, Bitter, Salty]"
1,Alcohol/Fermented,"[Alcohol, Whiskey, Winey, Fermented, Overripe,..."
2,Fruity,"[Fruity, Berry, Strawberry, Raspberry, Blueber..."
3,Sour/Acid,"[Sour, Sour, Aromatics, Acetic, acid, Butyric,..."
4,Green/Vegetative,"[Olive, Oil, Raw, Under, ripe, Peapod, Green, ..."


In [9]:
# Go from one row per category to one row per (category, tag) pair, renumber
# the rows from zero, and give every tag a leading capital with the rest in
# lower case.
df = (
    df.explode('Tags')
      .reset_index(drop=True)
      .assign(Tags=lambda frame: frame.Tags.str.capitalize())
)
df.head()

Unnamed: 0,Category,Tags
0,Taste Basics,Sweet
1,Taste Basics,Sour
2,Taste Basics,Bitter
3,Taste Basics,Salty
4,Alcohol/Fermented,Alcohol


In [10]:
# Add a 'ProcessedTags' column holding the English Snowball stem of each tag.
stemmer = SnowballStemmer(language='english')

# The bound method is passed directly; no wrapping lambda is needed.
# na_action='ignore' keeps missing tags as NaN instead of handing them to
# stem(), which expects a string. explode() produces NaN for a category
# whose tag list is empty.
df['ProcessedTags'] = df.Tags.map(stemmer.stem, na_action='ignore')
df.head()

Unnamed: 0,Category,Tags,ProcessedTags
0,Taste Basics,Sweet,sweet
1,Taste Basics,Sour,sour
2,Taste Basics,Bitter,bitter
3,Taste Basics,Salty,salti
4,Alcohol/Fermented,Alcohol,alcohol


In [11]:
# Write the final table to <inputDir>/category.csv without the row index.
# os.path.join is used so the path separator follows the host OS rather than
# a hard-coded Windows backslash.
df.to_csv(os.path.join(inputDir, 'category.csv'), index=False)