In [3]:
from datasets import load_dataset, Image
import pandas as pd
import numpy as np
import spacy
from spacy.language import Language
from spacy.tokens import Doc
from spacy.tokens import Token
from spacy.lang.en import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy_fastlang
from transformers import BertTokenizer, BertModel
import torch
import torchvision.models as models
from torchvision.transforms import Resize
from torchvision.transforms import Normalize
from torchvision.transforms import ToTensor
from torch.autograd import Variable
from PIL import Image
from io import BytesIO
import requests
from sklearn.feature_extraction.text import CountVectorizer
import pickle

In [4]:
posts = load_dataset("p1atdev/pinterest", split='train', )

In [5]:
posts_df = posts.to_pandas()
posts_df

Unnamed: 0,tags,url,src,alt
0,"[Character Design Animation, Character Design ...",https://www.pinterest.com/pin/12525705205055127/,https://i.pinimg.com/originals/4f/42/05/4f4205...,Here Comes The Grump - Concept Art
1,"[Little Nightmares Fanart, Dreams And Nightmar...",https://www.pinterest.com/pin/832814156105807021/,https://i.pinimg.com/originals/91/43/b9/9143b9...,Tarsier Studios Little Nightmares II Art Blast...
2,"[Cyberpunk City, Cyberpunk 2077, Cyberpunk Kun...",https://www.pinterest.com/pin/515099276148793993/,https://i.pinimg.com/originals/28/e1/24/28e124...,Showcase of Mind Blowing Concept Art of Futuri...
3,"[Fantasy Kunst, Fantasy City, Fantasy Places, ...",https://www.pinterest.com/pin/354728908162906711/,https://i.pinimg.com/originals/01/98/37/019837...,
4,"[Creature Concept Art, Creature Design, Creatu...",https://www.pinterest.com/pin/894738650947032543/,https://i.pinimg.com/originals/22/de/51/22de51...,Book of the Dead: Concept Art | Unity Blog
...,...,...,...,...
3570,"[Photo 3d, Image Beautiful, 3d Mesh, Art Vinta...",https://www.pinterest.com/pin/564638872046245267/,https://i.pinimg.com/originals/f7/44/8d/f7448d...,The Surreal Portraiture of Simple Objects
3571,"[Girls Cartoon Art, Cartoon Art Styles, Anime ...",https://www.pinterest.com/pin/706150416606406453/,https://i.pinimg.com/originals/2d/63/6b/2d636b...,Stephanie Priscilla on Twitter
3572,"[Japon Illustration, Cute Illustration, Websit...",https://www.pinterest.com/pin/37858453109156245/,https://i.pinimg.com/originals/d5/15/92/d51592...,Cat Tea by SeerLight on DeviantArt
3573,"[Cyberpunk Aesthetic, Arte Cyberpunk, Pixel Ar...",https://www.pinterest.com/pin/702491241888463819/,https://i.pinimg.com/originals/e5/b1/c0/e5b1c0...,Cyberpunk Pixel Scene Commission for TopHatCal...


In [6]:
posts_df.isna().sum()

tags    0
url     0
src     0
alt     0
dtype: int64

In [7]:
posts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3575 entries, 0 to 3574
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tags    3575 non-null   object
 1   url     3575 non-null   object
 2   src     3575 non-null   object
 3   alt     3575 non-null   object
dtypes: object(4)
memory usage: 111.8+ KB


In [8]:
posts_df.tags.apply(lambda x: len(x) == 0).sum()

15

## "Alt" feature

In [9]:
np.sum(posts_df.alt.str.len() == 0)

615

In [10]:
posts_df.alt.str.len().describe()

count    3575.000000
mean       41.160280
std        52.866757
min         0.000000
25%        10.000000
50%        28.000000
75%        56.000000
max       500.000000
Name: alt, dtype: float64

## TF-IDF

In [11]:
nlp_md = spacy.load('en_core_web_sm')

In [12]:
nlp = nlp_md

In [13]:
Token.set_extension('is_stopword', default=False, force=True)
Doc.set_extension('preprocessed_text', default='', force=True)

@Language.component("detect_stopwods")
def detect_stopwods(doc: Doc):
    for token in doc:
        if (token.text.lower() in stop_words.STOP_WORDS) or (not token.is_alpha):
            token._.is_stopword = True
    return doc

@Language.component("add_preprocessed_text")
def add_preprocessed_text(doc: Doc):
    preprocessed_tokens = []
    for token in doc:
        if not token._.is_stopword:
            preprocessed_tokens.append(token.lemma_.lower())
        doc._.preprocessed_text = " ".join(preprocessed_tokens)
    return doc


def create_pipeline(nlp):
    nlp.add_pipe('detect_stopwods', last=True)
    nlp.add_pipe('add_preprocessed_text', last=True)
    nlp.add_pipe("language_detector")
    return nlp


nlp = create_pipeline(nlp)


def preprocess_text(text: str):
    doc = nlp(text)
    return doc



In [14]:
posts_df['preprocessed_alt'] = posts_df.alt.apply(preprocess_text)
posts_df = posts_df[posts_df['preprocessed_alt'].apply(lambda x: x._.language == 'en')]

In [15]:
list_preprocessed_alt = posts_df['preprocessed_alt'].apply(lambda x: x._.preprocessed_text).to_list()

In [16]:
len(list_preprocessed_alt)

2825

In [17]:
alt_vectorizer = TfidfVectorizer(max_df=0.9, min_df=0.0005)
X = alt_vectorizer.fit_transform(list_preprocessed_alt)
posts_df['tfidf_descr_vector'] = list(X.toarray())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['tfidf_descr_vector'] = list(X.toarray())


In [18]:
pickle.dump(alt_vectorizer, open('../Models/TfidfVectorizer.pkl', 'wb'))

## BERT

In [19]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

tokens = tokenizer(posts_df['alt'][0], padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**tokens)
vector = outputs.last_hidden_state.mean(dim=1).numpy()[0]

In [20]:
def get_vec_from_bert(text):
    
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model(**tokens)
        
    return outputs.last_hidden_state.mean(dim=1).numpy()[0]

In [21]:
posts_df['bert_descr_vector'] = posts_df['alt'].apply(get_vec_from_bert)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['bert_descr_vector'] = posts_df['alt'].apply(get_vec_from_bert)


In [22]:
type(posts_df['bert_descr_vector'][0])

numpy.ndarray

# Tags
## Bag Of Words

In [23]:
def preprocess_tokens(post_tags: list):
    all_tags = []
    
    for tags in post_tags:
        all_tags.append(tags)
        
    joined = " ".join(all_tags)
    
    preprocessed_tags = preprocess_text(joined)
    preprocessed_tags = " ".join(list(set((preprocessed_tags._.preprocessed_text).split(" "))))
    return preprocessed_tags
    
preprocessed_tags = posts_df['tags'].apply(preprocess_tokens)
posts_df['preprocessed_tags'] = preprocessed_tags

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['preprocessed_tags'] = preprocessed_tags


In [24]:
count_vect = CountVectorizer()

vectorized_tags = count_vect.fit_transform(posts_df['preprocessed_tags'].to_list())

posts_df['tags_vector'] = list(vectorized_tags.toarray())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['tags_vector'] = list(vectorized_tags.toarray())


In [25]:
pickle.dump(count_vect, open('../Models/CountVectorizer.pkl', 'wb'))

# Image Embedding

In [26]:
resnet = models.resnet18(pretrained=True)

scaler = Resize((224, 224))
normalize = Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
to_tensor = ToTensor()



In [27]:

layer = resnet._modules.get('avgpool')

def vectorize_image(img):
    
    if img.mode != 'RGB':
        img = img.convert('RGB')
    
    t_img = Variable(normalize(to_tensor(scaler(img))).unsqueeze(0))
    embedding = torch.zeros(1, 512, 1, 1)

    def copy_data(m, i, o):
        embedding.copy_(o.data)

    h = layer.register_forward_hook(copy_data)

    resnet(t_img)
    
    h.remove()

    return np.array((embedding.squeeze().numpy()))

def vec_from_url(url: str):
    response = requests.get(url)
    
    with Image.open(BytesIO(response.content)) as im:
        return vectorize_image(im)

In [28]:
url = posts_df['src'][0]
vector = vec_from_url(url)
print(vector)

[0.89793724 0.9165713  0.8688449  0.9434708  0.94386214 0.91181105
 0.90356684 1.050785   0.94063336 0.94308734 0.85249966 0.7925527
 0.938904   0.8911817  0.95860195 0.9560891  0.9055819  1.3543128
 0.85973155 0.8707417  0.9359368  1.0552709  0.9203825  0.9828014
 0.9588462  0.9244528  0.87342864 0.9299794  0.86210096 0.96003675
 0.9726717  0.8735032  0.96777225 0.89472115 0.9649755  0.9119528
 0.94261366 0.9282695  0.9078262  0.9072848  0.9233081  0.8651609
 0.7412457  1.014484   0.86617106 0.90823054 0.88918704 1.2086434
 0.90077126 0.849416   0.9945899  0.9439015  0.8708466  0.96874523
 0.9642156  0.90518934 1.1826112  0.94162893 1.0572268  0.95048696
 0.8965212  0.92021364 0.9607186  0.8686256  0.87181425 0.99643487
 0.82672065 0.93549967 0.95520407 0.86392903 0.87105733 0.79267067
 0.8898691  0.8836879  0.82492405 0.92524207 0.82932025 1.0016358
 0.9010905  0.8582693  0.9166435  1.029429   1.0701919  1.1630875
 0.7909747  0.9020797  0.97063684 0.9205482  0.86163664 0.8929248
 0.8

In [29]:
vec_img = posts_df['src'].apply(vec_from_url)

In [30]:
posts_df['image_vector'] = vec_img

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['image_vector'] = vec_img


In [31]:
type(posts_df['image_vector'][0])

numpy.ndarray

In [32]:
posts_df['id'] = posts_df.index.tolist()
posts_df.rename(columns={'concat_vector': 'vector'}, inplace=True)

posts_df.drop(['preprocessed_alt', 'preprocessed_tags'], axis=1).to_parquet('posts.parquet.gzip',
              compression='gzip')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['id'] = posts_df.index.tolist()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df.rename(columns={'concat_vector': 'vector'}, inplace=True)
