In [1]:
import pandas as pd
import numpy as np
import string
!pip install nltk
import nltk
nltk.download('punkt')
from nltk import word_tokenize, Counter
from nltk.corpus import stopwords
import itertools
import spacy
nlp = spacy.load('en_core_web_sm')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Objective - Use the raw captions from the scraped data and convert them into lists of words and TF / TF-IDF matrices


In [2]:
def lemmatization(text):
    """Return ``text`` with every token replaced by its lemma.

    The string is parsed with the module-level spaCy pipeline ``nlp`` and the
    ``lemma_`` of each resulting token is joined back together with single spaces.
    """
    doc = nlp(text)
    return " ".join(token.lemma_ for token in doc)

In [3]:
def wrangle(text):
    """Normalise a scraped caption to plain printable ASCII.

    Curly double quotes and the right curly single quote are first converted to
    their straight equivalents; afterwards every character that is not in
    ``string.printable`` (emoji, dashes, other non-ASCII symbols) is dropped.
    """
    straight_quotes = str.maketrans({'“': '"', '”': '"', '’': "'"})
    allowed = set(string.printable)
    return ''.join(ch for ch in text.translate(straight_quotes) if ch in allowed)

In [4]:
def remove_stopwords(lst, language=None):
    """Drop stop words, punctuation marks and single digits from a token list.

    Parameters
    ----------
    lst : list of str
        Tokens of one caption.
    language : str, optional
        Name of the NLTK stop-word list to use (e.g. ``'english'``). The default
        ``None`` keeps the original behaviour of using the stop words of *every*
        language shipped with the NLTK corpus.
        NOTE(review): for English-only captions ``'english'`` is probably what is
        wanted, since the all-language list also removes English words that happen
        to be stop words elsewhere - confirm before changing the default.

    Returns
    -------
    list of str
        The tokens of ``lst``, in order, that are neither a stop word, a single
        punctuation character nor a single digit character.

    Requires the NLTK ``stopwords`` corpus to be available locally.
    """
    # Build the exclusion set once per call. The previous version re-read the
    # stop-word corpus and rebuilt two lists for every single token.
    excluded = set(stopwords.words(language))
    excluded.update(string.punctuation)
    excluded.update(string.digits)
    return [word for word in lst if word not in excluded]

In [5]:
def caption_cleaning(data):
    """Clean, lemmatize and tokenize the ``caption`` column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``caption`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with three columns:
        ``caption`` (stringified, stripped and passed through ``wrangle``),
        ``caption_lemma`` (output of ``lemmatization``) and
        ``caption_list`` (lower-cased tokens with stop words and punctuation removed).

    Note: ``astype(str)`` turns missing captions into the literal text ``'nan'``.
    """
    # Explicit copy: assigning new columns to a bare ``data[['caption']]`` selection
    # raises SettingWithCopyWarning and may or may not write through to ``data``.
    captions = data[['caption']].copy()
    captions['caption'] = captions['caption'].astype(str).str.strip().map(wrangle)
    captions['caption_lemma'] = captions['caption'].map(lemmatization)

    # tokenise the lemmatized text, then remove stop words and punctuation
    captions['caption_list'] = (
        captions['caption_lemma']
        .map(lambda text: word_tokenize(text.lower()))
        .map(lambda tokens: remove_stopwords(tokens))
    )

    return captions

In [6]:
def tf(col):
    """Build a term-frequency (raw count) table for a column of documents.

    Parameters
    ----------
    col : pandas.Series
        One text document per entry.

    Returns
    -------
    pandas.DataFrame
        One row per document (in the order of ``col``, with a fresh default index)
        and one column per vocabulary term found by a default ``CountVectorizer``;
        cells hold the number of occurrences.
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(col.values)

    # ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only on old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        columns = list(vectorizer.get_feature_names_out())
    else:
        columns = vectorizer.get_feature_names()

    return pd.DataFrame(counts.toarray(), columns=columns)

In [7]:
def tf_idf(col):
    """Build a TF-IDF table for a column of documents.

    Parameters
    ----------
    col : pandas.Series
        One text document per entry.

    Returns
    -------
    pandas.DataFrame
        One row per document (in the order of ``col``, with a fresh default index)
        and one column per vocabulary term found by a default ``TfidfVectorizer``;
        cells hold the TF-IDF weight.
    """
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(col.values)

    # ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only on old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        columns = list(vectorizer.get_feature_names_out())
    else:
        columns = vectorizer.get_feature_names()

    return pd.DataFrame(weights.toarray(), columns=columns)

In [8]:
def caption_tf_idf(captions, idf=True):
    """Vectorize the tokenized captions into a TF-IDF (or raw count) table.

    Parameters
    ----------
    captions : pandas.DataFrame
        Frame with a ``caption_list`` column holding a list of tokens per row.
        Side effect: a ``caption_cleaned`` column (tokens joined by spaces) is
        added to this frame.
    idf : bool, default True
        ``True`` uses ``tf_idf``; ``False`` uses ``tf`` (raw counts).

    Returns
    -------
    pandas.DataFrame
        The vectorizer output restricted to terms that occur in ``caption_list``
        and are longer than two characters. Columns keep the vectorizer's order,
        so the result is identical from run to run.
    """
    # Unique tokens longer than two characters, collected in a single pass
    # (repeated list concatenation is quadratic in the number of tokens).
    vocabulary = {
        word
        for tokens in captions['caption_list']
        for word in tokens
        if len(word) > 2
    }

    # the vectorizers expect one string per document
    captions['caption_cleaned'] = captions['caption_list'].map(' '.join)

    if idf:
        weights = tf_idf(captions['caption_cleaned'])
    else:
        weights = tf(captions['caption_cleaned'])

    # Filter while iterating the frame's own columns: going through
    # ``list(set(...))`` produced a different column order on every run.
    kept_columns = [term for term in weights.columns if term in vocabulary]

    # copy so callers can add columns to the result without pandas warnings
    return weights[kept_columns].copy()

In [9]:
def label_cleaning(data):
    """Clean, lemmatize and tokenize the ``labels`` column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``labels`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with three columns:
        ``labels`` (stringified, stripped and passed through ``wrangle``),
        ``labels_lemma`` (output of ``lemmatization``) and
        ``labels_list`` (lower-cased tokens).

    Note: ``astype(str)`` turns missing labels into the literal text ``'nan'``.
    """
    # Explicit copy: assigning new columns to a bare ``data[['labels']]`` selection
    # raises SettingWithCopyWarning and may or may not write through to ``data``.
    labels = data[['labels']].copy()
    labels['labels'] = labels['labels'].astype(str).str.strip().map(wrangle)
    labels['labels_lemma'] = labels['labels'].map(lemmatization)

    # Tokenise the lemmatized text. Unlike captions, labels are deliberately NOT
    # passed through stop-word / punctuation removal.
    labels['labels_list'] = labels['labels_lemma'].map(
        lambda text: word_tokenize(text.lower()))

    return labels

In [10]:
def labels_tf_idf(labels, idf=True):
    """Vectorize the tokenized image labels into a TF-IDF (or raw count) table.

    Parameters
    ----------
    labels : pandas.DataFrame
        Frame with a ``labels_list`` column holding a list of tokens per row.
        Side effect: a ``labels_cleaned`` column (tokens joined by spaces) is
        added to this frame.
    idf : bool, default True
        ``True`` uses ``tf_idf``; ``False`` uses ``tf`` (raw counts).

    Returns
    -------
    pandas.DataFrame
        The vectorizer output restricted to terms that occur in ``labels_list``
        and are longer than two characters. Columns keep the vectorizer's order,
        so the result is identical from run to run.
    """
    # Unique tokens longer than two characters, collected in a single pass
    # (repeated list concatenation is quadratic in the number of tokens).
    vocabulary = {
        word
        for tokens in labels['labels_list']
        for word in tokens
        if len(word) > 2
    }

    # the vectorizers expect one string per document
    labels['labels_cleaned'] = labels['labels_list'].map(' '.join)

    if idf:
        weights = tf_idf(labels['labels_cleaned'])
    else:
        weights = tf(labels['labels_cleaned'])

    # Filter while iterating the frame's own columns: going through
    # ``list(set(...))`` produced a different column order on every run.
    kept_columns = [term for term in weights.columns if term in vocabulary]

    # copy so callers can add columns to the result without pandas warnings
    return weights[kept_columns].copy()

In [11]:
def img_cap_tf_idf(img_cap, idf=True):
    """Vectorize the combined label + caption tokens into a TF-IDF (or count) table.

    Parameters
    ----------
    img_cap : pandas.DataFrame
        Frame with an ``img_cap_list`` column holding a list of tokens per row.
        Side effect: a ``cleaned`` column (tokens joined by spaces) is added to
        this frame.
    idf : bool, default True
        ``True`` uses ``tf_idf``; ``False`` uses ``tf`` (raw counts).

    Returns
    -------
    pandas.DataFrame
        The vectorizer output restricted to terms that occur in ``img_cap_list``
        and are longer than two characters. Columns keep the vectorizer's order,
        so the result is identical from run to run.
    """
    # Unique tokens longer than two characters, collected in a single pass
    # (repeated list concatenation is quadratic in the number of tokens).
    vocabulary = {
        word
        for tokens in img_cap['img_cap_list']
        for word in tokens
        if len(word) > 2
    }

    # the vectorizers expect one string per document
    img_cap['cleaned'] = img_cap['img_cap_list'].map(' '.join)

    if idf:
        weights = tf_idf(img_cap['cleaned'])
    else:
        weights = tf(img_cap['cleaned'])

    # Filter while iterating the frame's own columns: going through
    # ``list(set(...))`` produced a different column order on every run.
    kept_columns = [term for term in weights.columns if term in vocabulary]

    # copy so callers can add columns to the result without pandas warnings
    return weights[kept_columns].copy()

## Caption

In [12]:
# Load the scraped Nike data; the caption text lives in the `caption` column.
data = pd.read_csv('Nike/nike_data.csv')
# Preview the first rows to confirm the file parsed as expected.
data.head()

Unnamed: 0.1,Unnamed: 0,img_url,caption,n_likes_1000,n_comments,age
0,0,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,“100% of myself is nothing compared to 1% of t...,290k,28175,5 days ago
1,1,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,"Meet @azusa25nigo, the founder of @skate_girls...",88k,66716,6 days ago
2,2,https://instagram.flwo4-1.fna.fbcdn.net/v/t51....,It takes courage to take the first step 🏃. Jus...,243k,46306,6 days ago
3,3,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,“The climate crisis is affecting my sport and ...,159k,87011,1 week ago
4,4,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,“People like to tell us what we can and can’t ...,252k,67646,1 week ago


In [14]:
# Number of rows and columns in the loaded data.
data.shape

(612, 6)

In [None]:
# Clean, lemmatize and tokenize every caption (adds `caption_lemma` and `caption_list`).
captions = caption_cleaning(data)

In [None]:
# Preview only the first rows instead of dumping the whole frame
# (consistent with how the cleaned image labels are inspected).
captions.head()

In [None]:
# TF-IDF weights of the captions, stored together with the cleaned caption text
# and its token list, then written to disk.
captions_tf_idf_df = caption_tf_idf(captions).assign(
    caption=captions['caption'],
    caption_list=captions['caption_list'],
)
captions_tf_idf_df.to_csv('nike_caption_tf_idf.csv')

In [None]:
# Raw term counts of the captions, stored together with the cleaned caption text
# and its token list, then written to disk.
captions_tf_df = caption_tf_idf(captions, idf=False).assign(
    caption=captions['caption'],
    caption_list=captions['caption_list'],
)
captions_tf_df.to_csv('nike_caption_tf.csv')

## Image labels

In [13]:
# Load the image labels, discard the first spreadsheet row, renumber the rows
# from zero and lower-case the column names.
# NOTE(review): the reason row 0 is dropped is not visible here - confirm.
labels = (
    pd.read_excel('Nike/NikeLabels.xlsx')
    .drop(0, axis=0)
    .reset_index(drop=True)
    .rename(columns=str.lower)
)
labels

Unnamed: 0,labels,url,anger,joy,surprise
0,Furniture Picture frame Beard Standing Drawer ...,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,VERY_UNLIKELY,VERY_UNLIKELY,VERY_UNLIKELY
1,Sleeve Hat Music Workwear Cool Entertainment F...,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,VERY_UNLIKELY,UNLIKELY,VERY_UNLIKELY
2,Footwear Jeans Shoe Wheel Sports equipment Ska...,https://instagram.flwo4-1.fna.fbcdn.net/v/t51....,VERY_UNLIKELY,VERY_UNLIKELY,VERY_UNLIKELY
3,Joint Skin Shoe Arm Leg Shorts Purple Knee Com...,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,VERY_UNLIKELY,VERY_LIKELY,VERY_UNLIKELY
4,Skin Lip Shoulder White Eyelash Organ Lingerie...,https://instagram.flwo4-2.fna.fbcdn.net/v/t51....,VERY_UNLIKELY,VERY_UNLIKELY,VERY_UNLIKELY
...,...,...,...,...,...
607,Trousers Shirt Fashion Flash photography Perfo...,https://scontent-hel3-1.cdninstagram.com/v/t51...,VERY_UNLIKELY,POSSIBLE,VERY_UNLIKELY
608,Shoe Shorts Sneakers Flooring Floor Player Per...,https://scontent-hel3-1.cdninstagram.com/v/t51...,,,
609,Footwear Shorts Shoe Arm yoga pant Active pant...,https://scontent-hel3-1.cdninstagram.com/v/t51...,VERY_UNLIKELY,VERY_UNLIKELY,VERY_UNLIKELY
610,Shorts Dance Entertainment Active pants Perfor...,https://scontent-hel3-1.cdninstagram.com/v/t51...,VERY_UNLIKELY,VERY_UNLIKELY,VERY_UNLIKELY


In [None]:
# Clean, lemmatize and tokenize every label string (adds `labels_lemma` and `labels_list`).
image_labels = label_cleaning(labels)
# Preview the first rows of the cleaned labels.
image_labels.head()

In [None]:
# TF-IDF weights of the image labels, stored together with the cleaned label text
# and its token list, then written to disk.
image_tf_idf_df = labels_tf_idf(image_labels).assign(
    labels=image_labels['labels'],
    labels_list=image_labels['labels_list'],
)
print(image_tf_idf_df.head())
image_tf_idf_df.to_csv('nike_label_tf_idf.csv')

In [None]:
# Raw term counts of the image labels, stored together with the cleaned label text
# and its token list, then written to disk.
image_tf_df = labels_tf_idf(image_labels, idf=False).assign(
    labels=image_labels['labels'],
    labels_list=image_labels['labels_list'],
)
print(image_tf_df.head())
image_tf_df.to_csv('nike_label_tf.csv')

## Image + Caption 

In [None]:
# Put labels and captions side by side, matched on the row index.
# join='inner' keeps only rows present in BOTH frames: the outputs above show 612
# caption rows but 611 label rows, and with the default outer join the unmatched
# row gets NaN in `labels_list`, so the list concatenation below raises TypeError.
# NOTE(review): this relies on row i of the labels describing row i of the
# captions - confirm against the `url` / `img_url` columns.
image_caption = pd.concat(
    [image_labels[['labels', 'labels_list']], captions[['caption', 'caption_list']]],
    axis=1,
    join='inner',
)
# one combined token list per post: label tokens followed by caption tokens
image_caption['img_cap_list'] = image_caption.apply(
    lambda row: row['labels_list'] + row['caption_list'], axis=1)
image_caption.head()

In [None]:
# Sanity check on the first row: the combined token list must be exactly as long
# as the label tokens plus the caption tokens.
assert (
    len(image_caption['labels_list'].iloc[0]) + len(image_caption['caption_list'].iloc[0])
    == len(image_caption['img_cap_list'].iloc[0])
)

In [None]:
# TF-IDF weights of the combined label + caption tokens, stored together with the
# combined token list, then written to disk.
img_cap_tf_idf_df = img_cap_tf_idf(image_caption).assign(
    img_cap_list=image_caption['img_cap_list'],
)
print(img_cap_tf_idf_df.head())
img_cap_tf_idf_df.to_csv('nike_img_cap_tf_idf.csv')

In [None]:
# Raw term counts of the combined label + caption tokens, stored together with the
# combined token list, then written to disk.
img_cap_tf_df = img_cap_tf_idf(image_caption, idf=False).assign(
    img_cap_list=image_caption['img_cap_list'],
)
print(img_cap_tf_df.head())
img_cap_tf_df.to_csv('nike_img_cap_tf.csv')

In [None]:
# Sanity check: the total term count of the combined table should not exceed the
# caption and label totals together by more than 10.
# numeric_only=True is required because each frame also carries text / list
# columns; summing those row-wise raises TypeError on pandas >= 2.0.
assert (
    img_cap_tf_df.sum(axis=1, numeric_only=True).sum()
    - captions_tf_df.sum(axis=1, numeric_only=True).sum()
    - image_tf_df.sum(axis=1, numeric_only=True).sum()
) <= 10