# Overview

**Goal**

Extract makeup category, brand, product name/line, and shade mentions from each makeup tutorial (of an instagram account) into a dataframe that I will perform exploratory data analysis on and/or use to build a basic recommender (purposely scraped data for medium-tan skintones). Focusing on makeup because I'm curious about colors and their creative combinations.

**Ideal output**

Groupdict to convert into dataframe

{'category': foundation, 'brand': @hudabeauty, 'product': #fauxfilter, 'shade': 'tres leches'; 'category': concealer, 'brand': @lancomeofficial, 'product': 'teint idole ultra wear, 'shade': '220 buff'; 'category': bronzer, 'brand': @kikomilanousa, 'product': None, 'shade': 'bronze melange'; 'category': lips, 'brand': @ofracosmetics, 'product': None, 'shade':'mocha'}

**Notebook contents**

1. Single tutorial post
2. Top six patterns across posts
3. Sample dataset of data scraped from an individual beauty blogger's account on instagram

**Note:** Worked in Python 2.7

# Practice on single tutorial post

In [142]:
import re
import pandas as pd
import nltk
from nltk import regexp_tokenize, sent_tokenize
import os

post1 = 'paox33Matte eyes and glossy lips üòã featuring my baby hairsüë∂üèªüôÉEyes : @urbandecaycosmetics #urbandecay #nakedheatpalette Lashes : @lillylashes #lillylashes "lush" Mascara : @eyeko #eyekolashalertmascara Lips : @doseofcolors "knock on wood " topped off with @beccacosmetics #BECCAGlowGloss opal x jadeFoundation : @frankierosecosmetics #frankierosecosmetics shade "gold" coupon code for discount (paox) Brows : @benefitcosmetics #benefitbrows kabrow 04Bronzer : @benefitcosmetics #hoolabronzer Highlighter : @kikomilanousa glow fusion shade "03" used it on the inner corner of the eye as well #kikomilano #paox33 #hudabeauty #wakeupandmakeup #vegas_nay #motd #makeup #lillyghalichi139w'
post1

'paox33Matte eyes and glossy lips \xf0\x9f\x98\x8b featuring my baby hairs\xf0\x9f\x91\xb6\xf0\x9f\x8f\xbb\xf0\x9f\x99\x83Eyes : @urbandecaycosmetics #urbandecay #nakedheatpalette Lashes : @lillylashes #lillylashes "lush" Mascara : @eyeko #eyekolashalertmascara Lips : @doseofcolors "knock on wood " topped off with @beccacosmetics #BECCAGlowGloss opal x jadeFoundation : @frankierosecosmetics #frankierosecosmetics shade "gold" coupon code for discount (paox) Brows : @benefitcosmetics #benefitbrows kabrow 04Bronzer : @benefitcosmetics #hoolabronzer Highlighter : @kikomilanousa glow fusion shade "03" used it on the inner corner of the eye as well #kikomilano #paox33 #hudabeauty #wakeupandmakeup #vegas_nay #motd #makeup #lillyghalichi139w'

In [143]:
#Lowercase text
post1 = post1.lower()

#Remove emojis
emoji = re.compile(
    u"(\ud83d[\ude00-\ude4f])|"  # emoticons
    u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
    u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
    u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
    u"(\ud83c[\udde0-\uddff])"  # flags (iOS)
    "+", flags=re.UNICODE)

#post1.decode('utf-8')
post1 = re.sub(emoji, " ", post1)
post1

'paox33matte eyes and glossy lips \xf0\x9f\x98\x8b featuring my baby hairs\xf0\x9f\x91\xb6\xf0\x9f\x8f\xbb\xf0\x9f\x99\x83eyes : @urbandecaycosmetics #urbandecay #nakedheatpalette lashes : @lillylashes #lillylashes "lush" mascara : @eyeko #eyekolashalertmascara lips : @doseofcolors "knock on wood " topped off with @beccacosmetics #beccaglowgloss opal x jadefoundation : @frankierosecosmetics #frankierosecosmetics shade "gold" coupon code for discount (paox) brows : @benefitcosmetics #benefitbrows kabrow 04bronzer : @benefitcosmetics #hoolabronzer highlighter : @kikomilanousa glow fusion shade "03" used it on the inner corner of the eye as well #kikomilano #paox33 #hudabeauty #wakeupandmakeup #vegas_nay #motd #makeup #lillyghalichi139w'

In [144]:
#Identify makeup categories; code for partitioning continuous strings in next section
re.findall(r"\w+\s\:+", post1)

['eyes :',
 'lashes :',
 'mascara :',
 'lips :',
 'jadefoundation :',
 'brows :',
 '04bronzer :',
 'highlighter :']

In [145]:
#Trim trailing whitespace (i.e. "foundation :" to "foundation:")
post1 = re.sub(r"(\s:\s)", ": ", post1)
post1

'paox33matte eyes and glossy lips \xf0\x9f\x98\x8b featuring my baby hairs\xf0\x9f\x91\xb6\xf0\x9f\x8f\xbb\xf0\x9f\x99\x83eyes: @urbandecaycosmetics #urbandecay #nakedheatpalette lashes: @lillylashes #lillylashes "lush" mascara: @eyeko #eyekolashalertmascara lips: @doseofcolors "knock on wood " topped off with @beccacosmetics #beccaglowgloss opal x jadefoundation: @frankierosecosmetics #frankierosecosmetics shade "gold" coupon code for discount (paox) brows: @benefitcosmetics #benefitbrows kabrow 04bronzer: @benefitcosmetics #hoolabronzer highlighter: @kikomilanousa glow fusion shade "03" used it on the inner corner of the eye as well #kikomilano #paox33 #hudabeauty #wakeupandmakeup #vegas_nay #motd #makeup #lillyghalichi139w'

In [146]:
#Insert new line breaks before category mention to mark ending of 'sentence' (i.e. category + brand + product + shade is one sentence)
#Couldn't figure out how to regexp_tokenize
post1 = re.sub(r"(?=\s\w+:)", ".",post1)
post1

'paox33matte eyes and glossy lips \xf0\x9f\x98\x8b featuring my baby hairs\xf0\x9f\x91\xb6\xf0\x9f\x8f\xbb\xf0\x9f\x99\x83eyes: @urbandecaycosmetics #urbandecay #nakedheatpalette. lashes: @lillylashes #lillylashes "lush". mascara: @eyeko #eyekolashalertmascara. lips: @doseofcolors "knock on wood " topped off with @beccacosmetics #beccaglowgloss opal x. jadefoundation: @frankierosecosmetics #frankierosecosmetics shade "gold" coupon code for discount (paox). brows: @benefitcosmetics #benefitbrows kabrow. 04bronzer: @benefitcosmetics #hoolabronzer. highlighter: @kikomilanousa glow fusion shade "03" used it on the inner corner of the eye as well #kikomilano #paox33 #hudabeauty #wakeupandmakeup #vegas_nay #motd #makeup #lillyghalichi139w'

In [147]:
#Quickly scan sentences created
re.findall(r"(\w+\:+)\s(\@\w+)+\s(.+?)\.", post1)

[('eyes:', '@urbandecaycosmetics', '#urbandecay #nakedheatpalette'),
 ('lashes:', '@lillylashes', '#lillylashes "lush"'),
 ('mascara:', '@eyeko', '#eyekolashalertmascara'),
 ('lips:',
  '@doseofcolors',
  '"knock on wood " topped off with @beccacosmetics #beccaglowgloss opal x'),
 ('jadefoundation:',
  '@frankierosecosmetics',
  '#frankierosecosmetics shade "gold" coupon code for discount (paox)'),
 ('brows:', '@benefitcosmetics', '#benefitbrows kabrow'),
 ('04bronzer:', '@benefitcosmetics', '#hoolabronzer')]

In [148]:
#Compile regex for category + brand + product + shade
r = re.compile(r'(?P<category>\w+\:+)\s+(?P<brand>\@\w+)\s[^."]*(?:"(?P<shade>.+?)")?[^."]*(?:\.\s+|$)')

#Convert matches into dictionary
for m in r.finditer(post1):
    print m.groupdict()
    
#Convert dictionary to dataframe
df = pd.DataFrame([m.group('category', 'brand', 'shade') for m in r.finditer(post1)], columns = ['Category', 'Brand', 'Shade'])
df['Category'] = df['Category'].str.replace(r"\:", "")
df['Brand'] = df['Brand'].str.replace(r"\@", "")
df

{'category': 'eyes:', 'brand': '@urbandecaycosmetics', 'shade': None}
{'category': 'lashes:', 'brand': '@lillylashes', 'shade': 'lush'}
{'category': 'mascara:', 'brand': '@eyeko', 'shade': None}
{'category': 'lips:', 'brand': '@doseofcolors', 'shade': 'knock on wood '}
{'category': 'jadefoundation:', 'brand': '@frankierosecosmetics', 'shade': 'gold'}
{'category': 'brows:', 'brand': '@benefitcosmetics', 'shade': None}
{'category': '04bronzer:', 'brand': '@benefitcosmetics', 'shade': None}
{'category': 'highlighter:', 'brand': '@kikomilanousa', 'shade': '03'}


Unnamed: 0,Category,Brand,Shade
0,eyes,urbandecaycosmetics,
1,lashes,lillylashes,lush
2,mascara,eyeko,
3,lips,doseofcolors,knock on wood
4,jadefoundation,frankierosecosmetics,gold
5,brows,benefitcosmetics,
6,04bronzer,benefitcosmetics,
7,highlighter,kikomilanousa,03


# Practice on dataset with top six patterns

**Key Questions:**

1. How to best remove emojis

2. How to deal with continuous strings (common continuous strings are in category mentions - i.e. jadeFoundation, 03concealer, whereby previous category's shade and next category phrase are bunched up)

3. How to ensure category matches to related brand + product + shade in groupdict (for now, I inserted periods before each category mention to mimick sentences/keep the phrases 'together')

4. How to capture groups of text with 2-4 regex patterns and when sometimes the groups of text don't exist

Note: I think category/product can be interchangable

In [218]:
google_sheet_url = 'https://docs.google.com/spreadsheets/d/1KjMgV-tUFVRhYo81tQd87Za6xqGhWFFG3oIETUNybOs/export?format=csv&gid=340079439'
df = pd.read_csv(google_sheet_url)

In [219]:
#Lowercase
df['Description'] = df['Description'].str.decode('utf-8')
df['Description'] = df['Description'].str.lower()
df

Unnamed: 0,Description
0,@hudabeautyshop brown sugar@fentybeauty 310@si...
1,‚Äî@anastasiabeverlyhills 350c foundation @diorm...
2,‚Äî@beautyblender serum primer@beautyblender fou...
3,‚Äî@hudabeautyshop brown sugar foundation@maccos...
4,‚Äî@fourthraybeauty coconut face milk@anastasiab...
5,eyes : @hudabeautyshop cinnamon toast & hella ...


**RegEx Pattern (1)**: [@brand + 'product'/'shade'] & [@brand + 'product'/'shade' + 'category'] &  [@brand + 'category' + 'shade']

In [211]:
df['Description'][0]

u'@hudabeautyshop brown sugar@fentybeauty 310@sigmabeauty kabuki brush use code \u2018aalia\u2019@fentybeauty truffle match stix @maybelline concealer \u2018sand\u2019@hudabeauty banana bread powder@makeupobsession throw shade for contour & highlight@bobbibrown clementine blush@benefitcosmeticsuk brow setter@maccosmeticsuk cork lipliner@colourpopcosmetics lippie stix in candy paint @aaliacosmetics lashes in \u2018aalia\u2019\u2014#makeuptutorial#hudabeauty#hudabeautyshop#hudabeautyeasybake#makeupartist#tutorialmakeup#colourpop#maccosmetics#undiscovered_muas#discover_muas#bobbibrown#benefitbrowbar#benefit#fentybeauty#makeuplooks#maybelline#maybellineconcealer#drugstoremakeup32w'

**RegEx Pattern (2)**: [@brand + 'shade' + 'category'] & [@brand + 'category' + 'shade'] & [@brand + 'product' & @brand + 'shade']

In [212]:
df['Description'][1]

u'\u2014@anastasiabeverlyhills 350c foundation @diormakeup foundation 3wo@bperfectcosmetics carnival palette@nyxcosmetics_uk esmeralda in the waterline @fentybeauty unbutton & @hudabeautyshop staycation on lips@kkwbeauty gloss on top@aaliacosmetics \u2018aalia\u2019 lashes3d'

**RegEx Pattern (3)**: [@brand + 'category'/'product' + 'shade'] & [@brand + 'product' + 'shade'] & [@brand + 'shade' + 'category'/'product']

In [213]:
df['Description'][2]

u'\u2014@beautyblender serum primer@beautyblender foundation in 3.75@marcjacobsbeauty concealer in 53@tartecosmetics shape tape med tan-sand @hudabeautyshop banana bread powder@maccosmeticsuk dark deep powder @bobbibrownuk clementine blush@hudabeauty golden sands highlight @maccosmetics cork lip pencil @colourpopcosmetics dream date lippie stix @benefitcosmeticsuk brow pencil 3.5@marcjacobsbeauty gloss in moon glow@aaliacosmetics lashes in \u2018aalia\u2019\u2014#hudabeauty#bananabread#huda#hudabeautypalette#hudabeauty3dhighlight#beauty#beautybloggers#beautyblender#slave2makeup#makeuptutorial#tutorial#tutorialmakeup#discover_muas#beautyblenderfoundation\u2014earrings @lovetifboutique30w'

**RegEx Pattern (4)**: [@brand + 'shade(s)' + 'category'/'product'] & [@brand + 'product' + 'shade(s)'] & ['category: ' + @brand + 'product' + 'shade(s)']

In [214]:
df['Description'][3]

u'\u2014@hudabeautyshop brown sugar foundation@maccosmeticsmiddleeast face&body c6@fentybeauty truffle matchstix@tartecosmetics shape tape medium tan sand@anastasiabeverlyhills contour kit@maccosmeticsuk mineralize skin finish dark deep@bellapierreofficialuk blusher palette@elfcosmeticsuk brow gel@kikomilano 208 as highlighter@aaliacosmetics \u2018aalia\u2019 lasheslips:@maccosmetics lip liners cork & etcetera @nyxcosmetics_arabia london soft matte lip cream\u2014#aaliacosmetics#hudabeauty#huda#brownsugar#undiscovered_muas#discover_muas#discovervideos#muasupport#nyx#nyxarabia#tartecosmetics#tarteshapetape#anastasiabeverlyhills#abh#abhcontourkit#fauxfilter#elfcosmetics#getreadywithme#makeupoftheday#makeuplife#dubailife#revolutionprlist40w'

**RegEx Pattern (5)**: [@brand + 'shade' + 'category/product'] & [@brand + 'category']

In [215]:
df['Description'][4]

u'\u2014@fourthraybeauty coconut face milk@anastasiabeverlyhills 365c foundation @opvbeauty contour palette @maybelline caramel concealer@hudabeautyshop banana bread powder & granola concealer@colourpopcosmetics talk to the palm bronzer@narsissist blush@lauragellerbeauty gilded honey @colourpopcosmetics going coconuts palette@stilacosmetics glitter@aaliacosmetics \u2018dreamer\u2019 lashes@colourpopcosmetics she\u2019s here gloss@hudabeauty \u2018wednesday\u2019 matte as liner @maccosmeticsuk chestnut lip liner@maybelline 65 matte liquid lipstick\u2014#aaliacosmetics#hudabeauty#anastasiabeverlyhills#undiscoveredmuas#discover_muas#discover#makeuplife#makeup#glossylips#colourpop#colourpopcosmetics#abhfoundation#fluffybrows#pinkliner#hudabeauty16w'

**RegEx Pattern**: 'category :'/'category@'/'category @' + brand + product/shade; brand + shade + category/product

In [216]:
df['Description'][5]

u'eyes : @hudabeautyshop cinnamon toast & hella honey melted shadow@hudabeauty life liner@hudabeauty samantha lashes.face :@guerlain 04w + 05w l\u2019essentiel foundation@milkmakeup medium beige flex concealer@coverfx suntan bronze monochromatic bronzer duo@hudabeautyshop gold sands 3d highlighter palette@lilahbeauty aglow face mist.lips :@nyxcosmetics_canada after hours lipstick@sammarcelbeauty angeline satin lipstick11w'

In [192]:
#Identify makeup categories
category = r"()"
[re.findall(category, row) for row in df]

[[u'stix @', u'paint @'],
 [u'foundation @', u'waterline @'],
 [u'sand @',
  u'powder @',
  u'highlight @',
  u'pencil @',
  u'stix @',
  u'earrings @'],
 [u'etcetera @'],
 [u'foundation @', u'palette @', u'honey @', u'liner @'],
 [u'eyes :', u'face :@', u'lips :@']]

In [None]:
#Identify continuous strings
continuous = r"\w+bronzer\s[@:]|\w+foundation\s[@:]|\w+lips\s[@:]|\w+lip\s[@:]|\w+lipgloss\s[@:]|\w+concealer\s[@:]|\w+blush\s[@:]|\w+corrector\s[@:]|\w+lipliner\s[@:]|\w+eye\s[@:]|\w+lipstick\s[@:]|\w+contour\s[@:]|\w+palette\s[@:]|\w+eyeshadow\s[@:]|\w+eyeliner\s[@:]|\w+brows\s[@:]|\w+highlight\s[@:]|\w+gloss\s[@:]|\w+stick\s[@:]|\w+glow\s[@:]"
continuous = [re.findall(continuous, row) for row in df]
continuous

In [None]:
#Partition continuous strings
categories = [
    "foundation", "bronzer", "lips", "lip", "lipliner", "lipliners", "liner", "lipstick", "lipgloss", 
    "eyes", "eyeshadow", "eyebrows", "eyeliner", "brows", "concealer", "blush", "contour", 
    "corrector", "palette", "highlight", "highlighter", "gloss", "stick", "glow"]

df = [re.sub(r"({seps})".format(seps='|'.join(categories)), r' \1', i) for i in df]

# Practice on dataset (scraped data from one instagram account)

In [172]:
google_sheet_url_pao = 'https://docs.google.com/spreadsheets/d/1AuyA_Q_12EOT8gJ7S2Lm4_j-JBPESeA8NQCuLkkjsmw/export?format=csv&gid=1775475691'
pao_df = pd.read_csv(google_sheet_url_pao)
pao_df['Description'] = pao_df['Description'].str.decode('utf-8')
pao_df['Description'] = pao_df['Description'].str.lower()
pao_df = pao_df['Description']
pao_df

0      paox33wet skin üí¶‚ú®inspired by @amandakhamkaew ...
1      paox33for a light or full coverage look @estee...
2      paox33easy fall makeup‚ú® only using 2 shadows Ì†æ...
3      paox33this is what using an eyeshadow as a hig...
4      paox33today's makeup ‚ùÑÔ∏èleave me a "üòà"if you w...
5      paox33matte eyes and glossy lips üòã featuring ...
6      paox33no foundation makeup looküí•thank you so ...
7      paox33loving the new @maybelline fit me loose ...
8      paox33car selfies üëÄeyes : @urbandecaycosmetic...
9      paox33back at it with them makeup videos üò¨üíÉÌ†º...
10     paox33trying out new skincare products ‚ú®üí¶face...
11     paox33my current go to look üé•üî•eyeshadow prim...
12     paox33sending you guys lots of love and positi...
13     paox33my first #halloween makeup tutorialüëª wh...
14     paox33glossy ‚ú®üçë eyes @colourpopcosmetics #col...
15     paox33started using @frankierosecosmetics prod...
16     paox33that island glow üíï üå¥¬†the new

In [111]:
#Remove emojis
emoji = re.compile(
    u"(\ud83d[\ude00-\ude4f])|"  # emoticons
    u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
    u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
    u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
    u"(\ud83c[\udde0-\uddff])"  # flags (iOS)
    "+", flags=re.UNICODE)

pao_df = [re.sub(emoji, " ", row) for row in pao_df]
pao_df

[u'paox33wet skin  \u2728inspired by @amandakhamkaew  \u2014\u2014\u2014\u2014\u2014\u2014brows @milkmakeup kush fiber brow in dutch #milkmakeup lips @glossier lip gloss in clear transparent #glossier concealer @milanicosmetics #milanicosmetics conceal + perfect in medium beige highlighter @hudabeauty #hudabeautynymph in luna topped off with @pixibeauty subtle sunrise #pixibeauty blush @toofaced fruit cocktail blush duo in papaya pop #toofaced luminous dewy mist @tatcha #tatcha #paox3340w',
 u'paox33for a light or full coverage look @esteelauder #doublewear foundation is on top of the list  . it has medium-to-buildable coverage and comes in both liquid and powder so it\u2019s perfect for all skin types.i like to apply double wear matte powder on top of the liquid, or wear alone for a natural finish. #esteepartner53w',
 u'paox33easy fall makeup\u2728 only using 2 shadows \U0001f92d & 2 lipstick choices \U0001f917 which one was your fave ?face mist @bliss rose gold rescue #blissface oil 

In [112]:
#Identify makeup categories
makeup = r"\w+\s+?[@:-]"
[re.findall(makeup, row) for row in pao_df]

[[u'by @',
  u'brows @',
  u'lips @',
  u'concealer @',
  u'highlighter @',
  u'with @',
  u'blush @',
  u'mist @'],
 [u'look @'],
 [u'mist @',
  u'oil @',
  u'gel @',
  u'pencil @',
  u'fentybeautyconcealer @',
  u'palette @',
  u'and @',
  u'powder @',
  u'contour @',
  u'x @',
  u'blush @',
  u'highlighter @',
  u'liner @',
  u'vamplipstick @',
  u'combo @',
  u'x  @'],
 [u'palette :',
  u'lashes :',
  u'spray :',
  u'foundation :',
  u'a @',
  u'powder :',
  u'highlighter :',
  u'208bronzer :',
  u'lips :',
  u'shophudabeauty @'],
 [u'foundation :',
  u'eyes :',
  u'lashes :',
  u'bronzer :',
  u'lips :',
  u'with @',
  u'corner :'],
 [u'eyes :',
  u'lashes :',
  u'mascara :',
  u'lips :',
  u'with @',
  u'jadefoundation :',
  u'brows :',
  u'04bronzer :',
  u'highlighter :'],
 [u'moisturizer @',
  u'cream @',
  u'sponge @',
  u'medium @',
  u'bronzer @',
  u'highlighter @',
  u'lashes @',
  u'x @',
  u'eyeshadow @',
  u'eyeliner @',
  u'lip @',
  u'cream  @',
  u'from @'],
 [u'new

In [115]:
#Identify continuous strings
continuous = r"\w+bronzer\s[@:]|\w+foundation\s[@:]|\w+lips\s[@:]|\w+lip\s[@:]|\w+lipgloss\s[@:]|\w+concealer\s[@:]|\w+blush\s[@:]|\w+corrector\s[@:]|\w+lipliner\s[@:]|\w+eye\s[@:]|\w+lipstick\s[@:]|\w+contour\s[@:]|\w+palette\s[@:]|\w+eyeshadow\s[@:]|\w+eyeliner\s[@:]|\w+brows\s[@:]|\w+highlight\s[@:]|\w+gloss\s[@:]|\w+stick\s[@:]|\w+glow\s[@:]"
continuous = [re.findall(continuous, row) for row in pao_df]
continuous

[[],
 [],
 [u'fentybeautyconcealer @', u'vamplipstick @'],
 [u'208bronzer :'],
 [],
 [u'jadefoundation :', u'04bronzer :'],
 [],
 [],
 [],
 [],
 [],
 [u'brushhighlight :'],
 [u'9bronzer @'],
 [],
 [],
 [],
 [],
 [u'eyebrows @', u'naturalleconcealer @', u'lipgloss @'],
 [],
 [],
 [],
 [u'elfcosmeticsfoundation @', u'eyeseyeliner @'],
 [u'eyebrows @'],
 [u'eyebrows :'],
 [],
 [u'lipstick @'],
 [],
 [u'8concealer @', u'lipstick @'],
 [],
 [],
 [],
 [u'eyebrows @'],
 [u'milanicosmeticsblush @', u'duartbeautylips @'],
 [u'itsmyrayerayefoundation @', u'460concealer @'],
 [u'4contour @'],
 [],
 [],
 [u'eyebrows @'],
 [u'urbandecaycosmeticscontour :', u'lipstick :', u'lipgloss :'],
 [],
 [u'benefitbrowsbrows :', u'lipstick :', u'goldenconcealer :'],
 [u'melangebrows :'],
 [u'toofacedblush @', u'nabelalips @'],
 [u'eyebrows @'],
 [u'eyebrows :'],
 [u'lipstick @'],
 [],
 [],
 [],
 [],
 [u'902foundation @'],
 [u'lancomeofficialfoundation @', u'3cbronzer @'],
 [],
 [u'tina_yongbronzer @'],
 [u'per

In [116]:
#Partition continuous strings
categories = [
    "foundation", "bronzer", "lips", "lip", "lipliner", "lipliners", "liner", "lipstick", "lipgloss", 
    "eyes", "eyeshadow", "eyebrows", "eyeliner", "brows", "concealer", "blush", "contour", 
    "corrector", "palette", "highlight", "highlighter", "gloss", "stick", "glow"]

pao_df = [re.sub(r"({seps})".format(seps='|'.join(categories)), r' \1', i) for i in pao_df]

#Approach B
#pao_df = [re.sub(r'(?=(?:foundation|bronzer|lips|lip|lipliner|liner|lipstick|lipgloss|eyes|eyeshadow|eyebrows|eyeliner|brows|concealer|blush|contour|corrector|palette|highlight|highlighter|gloss|stick|glow) )', " ", i) for i in pao_df]
#pao_df = [re.sub(r'(?=(?:foundation|bronzer|lips|lip|lipliner|liner|lipstick|lipgloss|eyes|eyeshadow|eyebrows|eyeliner|brows|concealer|blush|contour|corrector|palette|highlight|highlighter|gloss|stick|glow))', " ", i) for i in pao_df]
#pao_df

In [117]:
#Trim trailing whitespace after category mention, before colon (i.e. "foundation :" to 'foundation:')
pao_df = [re.sub(r"(\s:\s)", ": ", i) for i in pao_df]

#Add space between colon and @ in category mentions (i.e. 'foundation:@' to 'foundation: @')
pao_df = [re.sub(r"(\w+\:@)", "\w+\: @", i) for i in pao_df]
pao_df

[u'paox33wet skin  \u2728inspired by @amandakhamkaew  \u2014\u2014\u2014\u2014\u2014\u2014 brows @milkmakeup kush fiber brow in dutch #milkmakeup  lips @ glossier  lip  gloss in clear transparent # glossier  concealer @milanicosmetics #milanicosmetics conceal + perfect in medium beige  highlighter @hudabeauty #hudabeautynymph in luna topped off with @pixibeauty subtle sunrise #pixibeauty  blush @toofaced fruit cocktail  blush duo in papaya pop #toofaced luminous dewy mist @tatcha #tatcha #paox3340w',
 u'paox33for a light or full coverage look @esteelauder #doublewear  foundation is on top of the list  . it has medium-to-buildable coverage and comes in both liquid and powder so it\u2019s perfect for all skin types.i like to apply double wear matte powder on top of the liquid, or wear alone for a natural finish. #esteepartner53w',
 u'paox33easy fall makeup\u2728 only using 2 shadows \U0001f92d & 2  lipstick choices \U0001f917 which one was your fave ?face mist @bliss rose gold rescue #bl

In [118]:
#Encode dataset (result still has \xe2...)
pao_df = [i.encode('utf-8') for i in pao_df]
pao_df

['paox33wet skin  \xe2\x9c\xa8inspired by @amandakhamkaew  \xe2\x80\x94\xe2\x80\x94\xe2\x80\x94\xe2\x80\x94\xe2\x80\x94\xe2\x80\x94 brows @milkmakeup kush fiber brow in dutch #milkmakeup  lips @ glossier  lip  gloss in clear transparent # glossier  concealer @milanicosmetics #milanicosmetics conceal + perfect in medium beige  highlighter @hudabeauty #hudabeautynymph in luna topped off with @pixibeauty subtle sunrise #pixibeauty  blush @toofaced fruit cocktail  blush duo in papaya pop #toofaced luminous dewy mist @tatcha #tatcha #paox3340w',
 'paox33for a light or full coverage look @esteelauder #doublewear  foundation is on top of the list  . it has medium-to-buildable coverage and comes in both liquid and powder so it\xe2\x80\x99s perfect for all skin types.i like to apply double wear matte powder on top of the liquid, or wear alone for a natural finish. #esteepartner53w',
 'paox33easy fall makeup\xe2\x9c\xa8 only using 2 shadows \xf0\x9f\xa4\xad & 2  lipstick choices \xf0\x9f\xa4\x97

In [119]:
# Create regex patterns for each category, brand, product, shade
# Note sometimes product is mentioned, sometimes not

category = r"(\w+\?:s\[@:-]+\s)" #patterns ["eyes:", "eyes : ", "eyes @", "eyes-"]
brand = r"(\@\w+\s)" #pattern ["@urbandecaycosmetics "] #also comes after category
product = r"(\w+(?=\@\w+\s))" #typically text that comes after brand mention and before shade
shade = r"()" #patterns ["'gold'", "shade 8&9", "in number 4", ""] #typically mentioned after product and before category

# Compile regex and tokenize
[regexp_tokenize(i, r"(\w+\?:s\[@:]+\s\.+?\.)") for i in pao_df]


[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],


In [33]:
#If sent_tokenize doesn't work, add periods

#pao_df = [re.sub(r"(?=\s+\w+\s+@)", ".", i) for i in pao_df]
#pao_df = [re.sub(r"(?=\s+\w+\:)", ".", i) for i in pao_df]
#pao_df

#Compile regex to find makeup categories
[re.findall(r"(\w+\?:s\[@:]+\s\@\w+\s.+?\?:w+@)", i) for i in pao_df]

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],


In [500]:
r3 = [re.findall(r'(?P<category>\w+\:+)\s+(?P<brand>\@\w+)\s[^."]*(?:"(?P<shade>.+?)")?[^."]*(?:\.\s+|$)', i) for i in pao_df]

#Convert matches into dictionary
for m in r3.finditer(pao_df):
    print m.groupdict()
    
#Convert dictionary to dataframe
df = pd.DataFrame([m.group('category', 'brand', 'shade') for m in r3.finditer(i) for i in pao_df], columns = ['Category', 'Brand', 'Shade'])
df['Category'] = df['Category'].str.replace(r"\:", "")
df['Brand'] = df['Brand'].str.replace(r"\@", "")
df

AttributeError: 'list' object has no attribute 'finditer'