In [1]:
import html
import re
import string

import pandas as pd
import spacy
from spacy import displacy

In [2]:
# Load the spaCy pipeline once at module level; generateNER and analyzeText
# both call this shared NLP object.
NLP = spacy.load("en_core_web_sm")

## Nike

In [30]:
# Read the Nike posts, using the first CSV column as the row index.
nike_data = pd.read_csv('nike/nike_data.csv', index_col=0)
# Rows with a missing caption are not dropped here (the NaN caption is blanked
# in a later cell); the disabled drop-and-reindex alternative is kept below.
# nike_data.dropna(inplace = True)
# nike_data.reset_index(drop=True, inplace=True)

In [31]:
# Show the full caption of the second row (position 1).
nike_data['caption'].iloc[1]

'Meet @azusa25nigo, the founder of @skate_girls_snap. 🛹\r\n\r\nSkate Girls Snap is a project that celebrates the diversity within Tokyo’s growing women’s skate scene. Through this project, Azusa is creating a more inclusive skate culture and inspiring the \r\nnext generation of female skaters to be themselves.\r\n\r\nLearn more about Skate Girls Snap and Azusa’s skate crew. Link in bio.\r\n\r\n🎥: @nobu_arakawa @callumhasegawa'

In [32]:
# Count missing values per column.
nike_data.isna().sum()

img_url         0
caption         1
n_likes_1000    0
n_comments      0
age             0
dtype: int64

In [34]:
# Replace missing captions with an empty string, then re-check the per-column
# missing-value counts.
nike_data['caption'] = nike_data['caption'].fillna('')
nike_data.isna().sum()

img_url         0
caption         0
n_likes_1000    0
n_comments      0
age             0
dtype: int64

In [35]:
# (rows, columns) of the Nike frame.
nike_data.shape

(612, 5)

In [36]:
# Column labels of the Nike frame.
nike_data.columns

Index(['img_url', 'caption', 'n_likes_1000', 'n_comments', 'age'], dtype='object')

In [37]:
# Preview the first rows of the Nike frame.
nike_data.head()

Unnamed: 0,img_url,caption,n_likes_1000,n_comments,age
0,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,“100% of myself is nothing compared to 1% of t...,290k,28175,5 days ago
1,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,"Meet @azusa25nigo, the founder of @skate_girls...",88k,66716,6 days ago
2,https://instagram.flwo4-1.fna.fbcdn.net/v/t51....,It takes courage to take the first step 🏃. Jus...,243k,46306,6 days ago
3,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,“The climate crisis is affecting my sport and ...,159k,87011,1 week ago
4,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,“People like to tell us what we can and can’t ...,252k,67646,1 week ago


In [38]:
def wrangle(text):
    """Normalise a raw caption into plain, printable ASCII text.

    Steps, in order:
      1. A missing value (``None`` or float NaN) becomes the empty string.
      2. HTML entities such as ``&quot;`` are decoded, so quotes that arrive
         in escaped form are seen as real quote characters downstream.
      3. Curly double quotes and curly single quotes/apostrophes are mapped
         to their ASCII equivalents.
      4. Every character outside ``string.printable`` (emoji, zero-width
         spaces, ...) is dropped.

    Parameters
    ----------
    text : str, None or float
        Raw caption. NaN, as pandas uses for an empty cell, is accepted.

    Returns
    -------
    str
        The cleaned caption.
    """
    # pandas represents a missing caption as float NaN, and NaN != NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Decode entities first so an escaped curly quote is normalised below too.
    text = html.unescape(text)
    text = (text.replace('“', '"').replace('”', '"')
                .replace('‘', "'").replace('’', "'"))

    printable = set(string.printable)
    return ''.join(ch for ch in text if ch in printable)

In [39]:
def firstLineQuote(text):
    """Classify a line of text by how it uses double quotes.

    The whole of ``text`` must match a pattern (``re.fullmatch``).

    Returns
    -------
    str
        ``'personal_quote'`` if the line is a double-quoted passage followed
        by one or more characters that are ``@``, ``-``, word characters or
        whitespace; ``'quote'`` if the line is only a double-quoted passage;
        ``''`` otherwise.
    """
    # Checked in this order; the first pattern that matches decides the label.
    checks = (
        (r'\"(.*?)\"[@\-\w\s]+', 'personal_quote'),
        (r'\"(.*?)\"', 'quote'),
    )
    for pattern, label in checks:
        if re.fullmatch(pattern, text) is not None:
            return label
    return ''

In [40]:
def generateNER(text):
    """Return the kept entity labels found in ``text`` as a comma-separated string.

    ``text`` is run through the module-level ``NLP`` pipeline. Each entity in
    ``doc.ents`` whose lower-cased label is in the allow-list below contributes
    that lower-cased label, in the order ``doc.ents`` yields them and with
    repeats preserved. When nothing is kept the result is ``''``.
    """
    kept_labels = ('event', 'fac', 'gpe', 'law', 'loc', 'money', 'norp',
                   'org', 'person', 'product', 'work_of_art')
    found = (ent.label_.lower() for ent in NLP(text).ents)
    return ','.join(label for label in found if label in kept_labels)
    

In [41]:
def analyzeText(text, show_render=False, show_ents=False, show_pos=False):
    """Print a caption and, optionally, spaCy diagnostics for it.

    With the default arguments the function prints the text under a header
    and parses it with the module-level ``NLP`` pipeline, nothing more. Each
    flag switches on one extra diagnostic section.

    Parameters
    ----------
    text : str
        The caption to inspect.
    show_render : bool, default False
        Render the entities inline with ``displacy`` (``jupyter=True``).
    show_ents : bool, default False
        Print each entity's text and label.
    show_pos : bool, default False
        Print each token with its part-of-speech tag.

    Returns
    -------
    None
    """
    print("----------- Actual Text ------------")
    print(text)
    doc = NLP(text)

    if show_render:
        print("----------- Spacy render -----------")
        displacy.render(doc, style="ent", jupyter=True)

    if show_ents:
        print("----------- Spacy NERs -------------")
        for ent in doc.ents:
            print(ent.text, ent.label_)

    if show_pos:
        print("----------- POS tag ----------------")
        # Token and tag
        for token in doc:
            print(token, token.pos_)

In [42]:
# Clean every caption with wrangle().
nike_data['caption_cleaned'] = nike_data['caption'].apply(wrangle)

In [43]:
# Comma-separated entity labels for each cleaned caption.
nike_data['NER'] = nike_data['caption_cleaned'].apply(generateNER)

In [44]:
# Classify only the first line of each cleaned caption; outer whitespace is
# trimmed both before the split and on the extracted line.
nike_data['firstLineQuote'] = nike_data['caption_cleaned'].apply(
    lambda caption: firstLineQuote(caption.strip().split('\n')[0].strip())
)

In [45]:
# Preview the frame including the derived caption_cleaned, NER and
# firstLineQuote columns.
nike_data.head()

Unnamed: 0,img_url,caption,n_likes_1000,n_comments,age,caption_cleaned,NER,firstLineQuote
0,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,“100% of myself is nothing compared to 1% of t...,290k,28175,5 days ago,"""100% of myself is nothing compared to 1% of t...",person,personal_quote
1,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,"Meet @azusa25nigo, the founder of @skate_girls...",88k,66716,6 days ago,"Meet @azusa25nigo, the founder of @skate_girls...","person,org,gpe,org,norp,work_of_art,org",
2,https://instagram.flwo4-1.fna.fbcdn.net/v/t51....,It takes courage to take the first step 🏃. Jus...,243k,46306,6 days ago,It takes courage to take the first step . Just...,"org,gpe,org,gpe,norp,work_of_art,org",
3,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,“The climate crisis is affecting my sport and ...,159k,87011,1 week ago,"""The climate crisis is affecting my sport and ...",person,quote
4,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,“People like to tell us what we can and can’t ...,252k,67646,1 week ago,"""People like to tell us what we can and can't ...","org,org,org",personal_quote


In [46]:
from sklearn.feature_extraction.text import CountVectorizer

# get term-frequency dataframe
def tf(col):
    """Build a term-frequency (raw count) table for a column of text.

    Parameters
    ----------
    col : pandas.Series
        One text entry per row.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``col`` (sharing ``col``'s index) and one column
        per vocabulary term found by ``CountVectorizer``, holding the counts.
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(col.values)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement when present and fall back on older versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        columns = vectorizer.get_feature_names_out()
    else:
        columns = vectorizer.get_feature_names()

    # Reuse the input's index so that pd.concat(..., axis=1) with the source
    # frame lines rows up even when that frame does not have a 0..n-1 index.
    return pd.DataFrame(counts.toarray(), columns=columns, index=col.index)

In [47]:
# Put the caption and engagement columns side by side with the NER label
# counts and the first-line quote counts.
nike_ner_quote = pd.concat(
    [
        nike_data[['caption', 'n_likes_1000', 'n_comments']],
        tf(nike_data['NER']),
        tf(nike_data['firstLineQuote']),
    ],
    axis='columns',
)
nike_ner_quote

Unnamed: 0,caption,n_likes_1000,n_comments,event,fac,gpe,law,loc,money,norp,org,person,product,work_of_art,personal_quote,quote
0,“100% of myself is nothing compared to 1% of t...,290k,28175,0,0,0,0,0,0,0,0,1,0,0,1,0
1,"Meet @azusa25nigo, the founder of @skate_girls...",88k,66716,0,0,1,0,0,0,1,3,1,0,1,0,0
2,It takes courage to take the first step 🏃. Jus...,243k,46306,0,0,2,0,0,0,1,3,0,0,1,0,0
3,“The climate crisis is affecting my sport and ...,159k,87011,0,0,0,0,0,0,0,0,1,0,0,0,1
4,“People like to tell us what we can and can’t ...,252k,67646,0,0,0,0,0,0,0,3,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,Run Dem Crew founder @daddydark rallys the tro...,6k,16,0,0,1,0,0,0,1,0,0,0,0,0,0
608,20 minutes of balling earned us 449 NikeFuel. ...,10k,81,0,0,0,0,0,2,0,0,0,1,0,0,0
609,10 explosive minutes from @ShawnJohnsons Nike+...,13k,125,0,0,0,0,0,1,0,1,1,0,0,0,0
610,We crashed a NTC class with @ShawnJohnson at t...,9k,48,0,0,0,0,0,0,0,2,1,0,0,0,0


In [48]:
# Write into the same lowercase 'nike/' directory the input was read from;
# a differently-cased directory name fails on case-sensitive filesystems.
nike_ner_quote.to_csv('nike/nike_ner_quote.csv', index=False)

## Adidas

In [51]:
# Read the Adidas posts (first CSV column as index), drop every row that has a
# missing value, and renumber the remaining rows from 0.
adidas_data = (
    pd.read_csv('adidas/adidas_data.csv', index_col=0)
    .dropna()
    .reset_index(drop=True)
)

In [52]:
# Show the full caption of the second row (position 1).
adidas_data['caption'].iloc[1]

'Mother. Daughter. Hooper. Leader.\u200b\r\n\r\nTo some she’s Candace, to others she’s Ace. To all, she’s an inspiration.\u200b\r\n\r\nFrom prodigy to icon, @candaceparker has always worked to redefine what’s possible.\u200b\r\n\r\n#ImpossibleIsNothing'

In [53]:
# (rows, columns) of the Adidas frame after dropping incomplete rows.
adidas_data.shape

(610, 5)

In [56]:
# Count missing values per column.
adidas_data.isna().sum()

img_url         0
caption         0
n_likes_1000    0
n_comments      0
age             0
dtype: int64

In [57]:
# Column labels of the Adidas frame.
adidas_data.columns

Index(['img_url', 'caption', 'n_likes_1000', 'n_comments', 'age'], dtype='object')

In [58]:
# Preview the first rows of the Adidas frame.
adidas_data.head()

Unnamed: 0,img_url,caption,n_likes_1000,n_comments,age
0,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,As South Africa’s first Black rugby captain @s...,22k,356,1 week ago
1,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,Mother. Daughter. Hooper. Leader.​\r\n\r\nTo s...,20k,530,2 weeks ago
2,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,"Records are made to be broken.\r\n\r\nToday, A...",37k,179,3 weeks ago
3,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,Sport is for everyone.,144k,746,4 weeks ago
4,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,"&quot;She never saw what she could not, only w...",26k,17471,1 month ago


In [59]:
# Clean every caption with wrangle().
adidas_data['caption_cleaned'] = adidas_data['caption'].apply(wrangle)

In [60]:
# Comma-separated entity labels for each cleaned caption.
adidas_data['NER'] = adidas_data['caption_cleaned'].apply(generateNER)

In [61]:
# Classify only the first line of each cleaned caption; outer whitespace is
# trimmed both before the split and on the extracted line.
adidas_data['firstLineQuote'] = adidas_data['caption_cleaned'].apply(
    lambda caption: firstLineQuote(caption.strip().split('\n')[0].strip())
)

In [62]:
# Put the caption and engagement columns side by side with the NER label
# counts and the first-line quote counts.
adidas_ner_quote = pd.concat(
    [
        adidas_data[['caption', 'n_likes_1000', 'n_comments']],
        tf(adidas_data['NER']),
        tf(adidas_data['firstLineQuote']),
    ],
    axis='columns',
)
adidas_ner_quote

Unnamed: 0,caption,n_likes_1000,n_comments,event,fac,gpe,law,loc,money,norp,org,person,product,work_of_art,personal_quote,quote
0,As South Africa’s first Black rugby captain @s...,22k,356,0,0,1,0,0,0,0,0,1,0,0,0,0
1,Mother. Daughter. Hooper. Leader.​\r\n\r\nTo s...,20k,530,0,0,0,0,0,0,0,2,0,0,0,0,0
2,"Records are made to be broken.\r\n\r\nToday, A...",37k,179,0,0,2,0,0,1,0,2,4,0,0,0,0
3,Sport is for everyone.,144k,746,0,0,0,0,0,0,0,0,0,0,0,0,0
4,"&quot;She never saw what she could not, only w...",26k,17471,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,"Donated by proud LBP alum @SnoopDogg, check ou...",50k,206,0,0,0,0,0,0,0,3,0,0,0,0,0
606,#Primeknit Pure #BOOST: extreme energy meets a...,60k,598,0,0,0,0,0,0,0,1,0,0,0,0,0
607,"Predator is the past, present and future. Inst...",64k,221,0,0,0,0,0,0,0,1,0,0,0,0,0
608,Simply #unstoppable: the @adidasOriginals by @...,54k,460,0,0,1,0,0,0,0,1,0,0,0,0,0


In [63]:
# Write into the same lowercase 'adidas/' directory the input was read from;
# a differently-cased directory name fails on case-sensitive filesystems.
adidas_ner_quote.to_csv('adidas/adidas_ner_quote.csv', index=False)