In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sys
from sklearn import feature_selection
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter
%matplotlib inline
np.set_printoptions(threshold=sys.maxsize)
import spacy
import collections
import requests
import os
from PIL import Image
import time
import io
import hashlib

In [2]:
# Load the full card list (one row per card).
df = pd.read_json("cah-cards-full.json")
# drop_duplicates returns a new frame, so the result has to be assigned;
# the bare call left the duplicated card texts in place.
df = df.drop_duplicates(subset="text")

# Select the texts with boolean masks instead of growing arrays row by row:
# np.append copies the whole array on every call, which is quadratic overall.
# White cards are the answers (cardType 'A').
white_sentences = df.loc[df['cardType'] == 'A', 'text'].to_numpy(dtype=str)

# Black cards are the questions (cardType 'Q'); keep only those with numAnswers < 2.
black_sentences = df.loc[(df['cardType'] == 'Q') & (df['numAnswers'] < 2), 'text'].to_numpy(dtype=str)

In [3]:
# Report how many card texts were selected for each colour.
print(f"Number white sentences :{len(white_sentences)}")
print(f"Number black sentences :{len(black_sentences)}")

Number white sentences :961
Number black sentences :161


In [4]:
# Load the en_core_web_lg model package; the resulting `nlp` callable is what
# preprocess_text uses to tokenise each card text.
import en_core_web_lg
nlp = en_core_web_lg.load()

In [5]:
def preprocess_text(text, flg_lemm=True):
    """Normalise a card text into a space-separated string of content words.

    The text is lower-cased, stripped, cleared of every character that is
    neither a word character nor whitespace, and cleared of digit runs that
    follow a space. It is then tokenised with the module-level `nlp`
    pipeline and tokens flagged as stop words are dropped.

    Parameters
    ----------
    text : any
        Value to clean; it is converted with ``str()`` first.
    flg_lemm : bool, default True
        If True, each kept token is replaced by its lemma; otherwise the
        token's original text is used.

    Returns
    -------
    str
        Remaining words joined by single spaces (may be the empty string).
    """
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    text = re.sub(r' \d+', '', text)
    # Built-in str() replaces np.str, an alias that recent NumPy releases no longer provide.
    doc = nlp(str(text))
    ## remove Stopwords
    kept_tokens = [token for token in doc if not token.is_stop]
    # Always reduce tokens to strings: joining raw token objects (the old
    # flg_lemm=False path) raises TypeError.
    words = [token.lemma_ if flg_lemm else token.text for token in kept_tokens]
    return ' '.join(words)

In [6]:
# Single-text wrapper that always runs preprocess_text with lemmatisation on.
def func(x):
    return preprocess_text(x, flg_lemm=True)

# Element-wise version of func for arrays of card texts.
vfunc = np.vectorize(func)

In [7]:
# Run preprocess_text (lemmatisation on, via vfunc) over every white-card text.
white_sentences_ = vfunc(white_sentences)

In [8]:
# Run preprocess_text (lemmatisation on, via vfunc) over every black-card text.
black_sentences_ = vfunc(black_sentences)

In [9]:
# Discard cards whose text became the empty string during preprocessing.
white_sentences_, black_sentences_ = (
    cards[cards != ''] for cards in (white_sentences_, black_sentences_)
)

In [10]:
def _vocabulary(sentences):
    """Build the word statistics for one deck of preprocessed sentences.

    Returns a 4-tuple: the word Counter, the alphabetically sorted word list
    (index -> word), the word -> index dict for that list, and the words
    ordered by descending frequency.
    """
    counts = collections.Counter(' '.join(sentences).split())
    by_frequency = [word for word, _ in counts.most_common()]
    index_to_word = sorted(by_frequency)
    word_to_index = {word: idx for idx, word in enumerate(index_to_word)}
    return counts, index_to_word, word_to_index, by_frequency


# Word counts, index -> word list, word -> index mapping and frequency-ordered words per deck
wc_white, vocab_inv_white, vocab_white, words_white = _vocabulary(white_sentences_)
wc_black, vocab_inv_black, vocab_black, words_black = _vocabulary(black_sentences_)

# Size of each vocabulary (number of distinct words)
vocab_white_size = len(words_white)
print("vocab size white: ", vocab_white_size)
vocab_black_size = len(words_black)
print("vocab size black: ", vocab_black_size)

vocab size white:  1473
vocab size black:  502


In [11]:
# Path of the chromedriver binary handed to webdriver.Chrome.
# Set the CHROMEDRIVER_PATH environment variable to run the notebook on another
# machine; without it the original local path is used unchanged.
DRIVER_PATH = os.environ.get(
    'CHROMEDRIVER_PATH',
    '/Users/sofiaadornibraccesi/Downloads/Scraping/chromedriver',
)

In [12]:
# Selenium's webdriver module; the function definitions further down reference it by name.
from selenium import webdriver
# Start a Chrome session driven by the chromedriver binary at DRIVER_PATH.
# NOTE(review): search_and_download opens its own Chrome instance, so this
# module-level browser appears to be used only by the final wd.quit() cell —
# confirm it is still needed.
wd = webdriver.Chrome(executable_path=DRIVER_PATH)

In [13]:
#wd.get('https://google.com')

In [14]:
#search_box = wd.find_element_by_css_selector('input.gLFyf')
#search_box.send_keys('Dogs')

In [15]:
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:"webdriver.Chrome", sleep_between_interactions:int=1):
    """Collect full-size image URLs from a Google Images search.

    The results page is scrolled, every thumbnail (``img.Q4LuWd``) is clicked,
    and the ``src`` of each resulting ``img.n3VNCb`` element is recorded when
    it contains ``http``.

    Parameters
    ----------
    query : str
        Search phrase; it is URL-encoded before being placed in the URL.
    max_links_to_fetch : int
        Stop as soon as at least this many distinct URLs were collected.
    wd : selenium Chrome driver
        An open driver used to load and interact with the page.
    sleep_between_interactions : int, default 1
        Seconds to wait after each scroll, thumbnail click and "load more" click.

    Returns
    -------
    set of str
        The URLs found. The set may hold fewer than ``max_links_to_fetch``
        entries (or be empty) when the page runs out of thumbnails, and a few
        more when a single click exposes several images.
    """
    from urllib.parse import quote_plus

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page (the query is encoded so spaces and reserved characters are safe)
    wd.get(search_url.format(q=quote_plus(query)))

    image_urls = set()
    results_start = 0
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        # Scrolling produced no new thumbnails: stop here and keep what was
        # collected so far instead of looping forever.
        if number_results <= results_start:
            print(f"Found: {len(image_urls)} image links, no more results available.")
            break

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Every current thumbnail was visited without reaching the target.
            # The old code returned None at this point, which threw away the
            # URLs already collected and left the "load more" click unreachable.
            print("Found:", len(image_urls), "image links, looking for more ...")
            # find_elements (plural) yields an empty list rather than raising
            # when the button is absent.
            if wd.find_elements_by_css_selector(".mye4qd"):
                wd.execute_script("document.querySelector('.mye4qd').click();")
                time.sleep(sleep_between_interactions)

        # move the result startpoint further down
        results_start = number_results

    return image_urls

In [16]:
def persist_image(folder_path:str,url:str, num=0, timeout:float=10):
    """Download one image and store it as ``<num>.jpg`` inside ``folder_path``.

    The downloaded bytes are decoded, converted to RGB and re-encoded as JPEG
    (quality 85). Failures are reported on stdout and never raised.

    Parameters
    ----------
    folder_path : str
        Existing directory that receives the file.
    url : str
        Address of the image to download.
    num : int, default 0
        Number used as the file name (``str(num) + '.jpg'``).
    timeout : float, default 10
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the whole scraping loop.

    Returns
    -------
    None
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error statuses as download failures instead of trying to
        # decode an error page as an image.
        response.raise_for_status()
        image_content = response.content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # Nothing was downloaded: stop here. Falling through would hit an
        # undefined image_content and print a misleading "could not save" error.
        return

    try:
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path, str(num)+'.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")

In [17]:
def search_and_download(search_term:str,driver_path:str,target_path='/Users/sofiaadornibraccesi/Downloads/Scraping/images',number_images=15):
    """Scrape images for ``search_term`` into its own sub-folder of ``target_path``.

    The folder name is the lower-cased search term with spaces replaced by
    underscores. A folder that already contains entries is treated as done
    and skipped, so an interrupted run can be resumed. A missing or empty
    folder is (re)processed.

    Parameters
    ----------
    search_term : str
        Phrase to search for.
    driver_path : str
        Path of the chromedriver binary passed to ``webdriver.Chrome``.
    target_path : str
        Directory under which the per-term folders are created.
    number_images : int, default 15
        Number of image links requested from ``fetch_image_urls``.

    Returns
    -------
    None
    """
    target_folder = os.path.join(target_path,'_'.join(search_term.lower().split(' ')))

    # Only a non-empty folder counts as completed. The folder is created before
    # any download happens, so skipping on mere existence would permanently
    # skip terms whose earlier attempt failed.
    if os.path.isdir(target_folder) and os.listdir(target_folder):
        return

    os.makedirs(target_folder, exist_ok=True)

    # A fresh browser per term; the context manager closes it even on errors.
    with webdriver.Chrome(executable_path=driver_path) as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.5)

    if not res:
        print("None type for links.")
        return

    # Files are numbered in iteration order: 0.jpg, 1.jpg, ...
    for num, elem in enumerate(res):
        persist_image(target_folder, elem, num)

In [18]:
#white_sentences_= white_sentences_[7:]

In [19]:
#white_sentences_[7]

In [20]:
# Scrape images for every preprocessed white-card text, one search per text.
for search_term in white_sentences_: 
    search_and_download(search_term = search_term, driver_path = DRIVER_PATH)

In [21]:
# Print every white-card search term whose image folder contains nothing at all.
target_path='/Users/sofiaadornibraccesi/Downloads/Scraping/images'
for search_term in white_sentences_:
    # Same folder naming as search_and_download: lower-case, spaces -> underscores.
    target_folder = os.path.join(target_path, search_term.lower().replace(' ', '_'))
    if len(os.listdir(target_folder)) == 0:
        print(search_term)

In [23]:
# Print every white-card search term whose image folder holds fewer than 8 files.
target_path='/Users/sofiaadornibraccesi/Downloads/Scraping/images'
for search_term in white_sentences_:
    # Same folder naming as search_and_download: lower-case, spaces -> underscores.
    target_folder = os.path.join(target_path, search_term.lower().replace(' ', '_'))
    # Count regular files only; sub-directories are ignored.
    if sum(os.path.isfile(os.path.join(target_folder, name)) for name in os.listdir(target_folder)) < 8:
        print(search_term)

incest
kkk
heteronormativity
dick fingers
president george w bush
embryonic stem cell
auschwitz
micropenis
clandestine butt scratch
forget alamo
vagina
bite rich person
get crush piano
money
tombus talk rhombus
giggle slurp milkshake
truck
long tongue world
wear high heel
go commando
awful haircut


In [None]:
# Scrape images for every preprocessed black-card text, one search per text.
for search_term in black_sentences_: 
    search_and_download(search_term = search_term, driver_path = DRIVER_PATH)

In [None]:
# Shut down the module-level Chrome session created with webdriver.Chrome(...) earlier in the notebook.
wd.quit()