In [1]:
import json
import requests
import pickle
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

from tokens import TOKEN_KEY, TOKEN_CX

In [2]:
def get_google_search_results(string, siteSearch="", searchType="exact", page=0, hl="pt-BR", lr="lang_pt"):
    """Query the Google Custom Search JSON API and return the response and its hit count.

    Args:
        string: Text to search for.
        siteSearch: When non-empty, only results from this site are included.
        searchType: "exact" sends the text as `exactTerms`; "any" sends it as `q`.
        page: Zero-based page number, converted to the `start` index (10 results per page).
        hl: Value forwarded as the `hl` request parameter.
        lr: Value forwarded as the `lr` request parameter.

    Returns:
        A tuple `(response, total_results)` where `response` is the decoded JSON
        body and `total_results` is `searchInformation.totalResults` as an int.

    Raises:
        ValueError: If `searchType` is neither "exact" nor "any".
        Exception: ("Erro na requisição") if the response carries no total-result count.
    """
    # Validate first so a bad argument fails before any request is prepared.
    # 'exact' uses 'exactTerms' and 'any' uses 'q', which in theory returns
    # results that do not have the exact phrasing of the string.
    if searchType == "exact":
        term_key = "exactTerms"
    elif searchType == "any":
        term_key = "q"
    else:
        raise ValueError("Invalid search type")

    url = 'https://customsearch.googleapis.com/customsearch/v1'
    params = {
        "key": TOKEN_KEY,
        "cx": TOKEN_CX,
        "hl": hl,
        "lr": lr,
        "start": (page*10+1),
        term_key: string,
    }

    # If siteSearch is not empty, search only in the specified site
    if siteSearch:
        params["siteSearch"] = siteSearch
        params["siteSearchFilter"] = "i"

    # A timeout keeps a stalled connection from blocking the notebook indefinitely.
    response = requests.get(url, params=params, timeout=30).json()

    # Get total number of results if the request was successful
    try:
        total_results = response['searchInformation']['totalResults']
    except (KeyError, TypeError) as err:
        # Show the request for debugging, but never echo the credentials into
        # the notebook output, where they would be saved alongside the code.
        safe_params = {k: ("<redacted>" if k in ("key", "cx") else v) for k, v in params.items()}
        print(safe_params)
        print(response)
        raise Exception("Erro na requisição") from err

    return response, int(total_results)

In [3]:
def get_count_from_file(filepath):
    """Load a pickled download and return its result counts as a NumPy array.

    The pickle is expected to hold a sequence of groups, each group being a
    sequence of 2-item pairs whose second item is the count. The returned array
    has one row per group.

    NOTE: pickle.load can execute arbitrary code; only open files you produced.
    """
    with open(filepath, "rb") as handle:
        groups = pickle.load(handle)

    # Keep only the second element of every pair in every group.
    return np.array([[total for _, total in group] for group in groups])

In [4]:
def plot_file(filename, ylim_skip=3):
    """Plot the mean result count per entry of a pickled download.

    Draws one bar per entry (mean over its repeated requests), a shaded band of
    plus/minus one standard deviation, and a line through the means.

    Args:
        filename: Path of the pickle file; also used as the plot title.
        ylim_skip: Number of leading entries ignored when choosing the y-axis
            upper limit (presumably because the first entries dwarf the rest -
            TODO confirm). If the file has no entries beyond that, all entries
            are used instead of failing on an empty slice.
    """
    # Reuse the shared loader instead of duplicating the unpickling logic.
    counts = get_count_from_file(filename)
    mean = np.mean(counts, axis=1)
    std = np.std(counts, axis=1)  # computed once, used for both band edges
    positions = range(len(mean))

    # Partial downloads can contain fewer than ylim_skip + 1 entries; fall
    # back to the full series so max() never receives an empty sequence.
    scale_values = mean[ylim_skip:] if len(mean) > ylim_skip else mean

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.bar(positions, mean, color="#7ed9fc")
    ax.fill_between(positions, mean - std, mean + std, color="black", linestyle="--", alpha=0.5)
    ax.plot(positions, mean, color="black")
    ax.set_ylim(0, 1.2 * max(scale_values))
    ax.set_title(filename)
    ax.set_yticks([])
    # ax.set_yscale('log')
    plt.show()

In [5]:
def display_snippets(filename, pos=0):
    """Print the title and snippet of one search result for every entry in a pickled download.

    For each entry, the first stored response is used. A separator line and a
    header with the 1-based entry number and the reported total are printed,
    followed by the title and snippet of the result at index `pos`.

    Args:
        filename: Path of the pickle file to read.
        pos: Index of the result to show within each response's `items` list.

    Responses that have no `items` list, or fewer results than `pos` requires,
    are reported with a placeholder line instead of aborting the whole listing.

    NOTE: pickle.load can execute arbitrary code; only open files you produced.
    """
    with open(filename, "rb") as f:
        data = pickle.load(f)

    for i, r in enumerate(data):
        response = r[0][0]
        print("-" * 50)
        print(f"qt: {i+1}, total: {response['searchInformation']['totalResults']}")

        # 'items' may be missing from a response; treat that as no results.
        items = response.get('items', [])
        try:
            m = items[pos]
        except IndexError:
            print("\t", "(no result at this position)")
            continue

        print(m['title'])
        print("\t", m['snippet'])

In [6]:
def download_data(laugh_atom='k', start=1, end=25, siteSearch="", searchType="exact", hl="pt-BR", lr="lang_pt", simulations=1):
    """Search for `laugh_atom` repeated `start`..`end` times and save the results to disk.

    For each repetition count, `simulations` requests are made through
    `get_google_search_results`. After every repetition count the accumulated
    results are rewritten to three files under `data/`, so a failure partway
    through keeps what was already collected:

    - `<name>.pkl`: all `(response, total_results)` pairs, grouped per count.
    - `<name>_count.json`: only the totals, one list per count.
    - `<name>_texts.json`: the result links of the first response per count.

    Args:
        laugh_atom: String that is repeated to build each search term.
        start: First repetition count (inclusive).
        end: Last repetition count (inclusive).
        siteSearch: Site restriction passed to the search; its part before the
            first "." is used in the file name.
        searchType: Search mode passed to the search.
        hl: `hl` value passed to the search; also used in the file name.
        lr: `lr` value passed to the search.
        simulations: Number of requests per repetition count.
    """
    import os  # local import: the notebook's import cell does not provide os

    all_data = []
    filepath = f"data/{laugh_atom}_{siteSearch.split('.')[0]}_{simulations}s_{start}-{end}_{hl}"

    # Create the output directory up front so the first save cannot fail
    # after requests have already been spent.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    for i in range(start, end+1):
        string = laugh_atom * i
        print(f"[{i} / {end}] Processing {string}...")

        data = []
        for j in range(simulations):
            print(f"  [{j + 1} / {simulations}] Making request for {string}...")
            response, total_results = get_google_search_results(string, siteSearch=siteSearch, searchType=searchType, hl=hl, lr=lr)
            data.append((response, total_results))

        all_data.append(data)

        with open(filepath + ".pkl", "wb") as f:
            pickle.dump(all_data, f)

        # Take the counts from memory instead of re-reading the pickle just written.
        count = [[c for _, c in k] for k in all_data]
        with open(filepath + "_count.json", "w") as f:
            json.dump(count, f, indent=4)

        # A response may have no 'items' key; .get() yields an empty list then.
        # (A trailing `if 'items' in ...` filter would be evaluated only after
        # the failing lookup.)
        texts = [[d['link'] for d in k[0][0].get('items', [])] for k in all_data]
        with open(filepath + "_texts.json", "w") as f:
            json.dump(texts, f, indent=4)

In [7]:
# download_data("w", start=1, end=50, siteSearch="twitter.com", simulations=2, hl="jw", lr="lang_ja")

In [8]:
# download_data("5", start=1, end=50, siteSearch="twitter.com", simulations=2, hl="ti", lr="lang_th")

In [9]:
# download_data("ho", start=1, end=50, siteSearch="twitter.com", simulations=2)

In [10]:
# download_data("k", start=51, end=100, siteSearch="twitter.com", simulations=2)

In [11]:
# download_data("xa", start=1, end=50, siteSearch="twitter.com", simulations=2, hl="ru", lr="lang_ru")

In [12]:
# download_data("wk", start=1, end=50, siteSearch="twitter.com", simulations=2, hl="id", lr="lang_id")

In [14]:
# Collect results for "ha" repeated 1 to 50 times on twitter.com, with 2 requests per length.
# NOTE(review): the recorded output of this cell ends with "Exception: Erro na requisição"
# while processing length 9, so this run did not reach length 50.
download_data("ha", start=1, end=50, siteSearch="twitter.com", simulations=2)

[1 / 50] Processing ha...
  [1 / 2] Making request for ha...
  [2 / 2] Making request for ha...
[2 / 50] Processing haha...
  [1 / 2] Making request for haha...
  [2 / 2] Making request for haha...
[3 / 50] Processing hahaha...
  [1 / 2] Making request for hahaha...
  [2 / 2] Making request for hahaha...
[4 / 50] Processing hahahaha...
  [1 / 2] Making request for hahahaha...
  [2 / 2] Making request for hahahaha...
[5 / 50] Processing hahahahaha...
  [1 / 2] Making request for hahahahaha...
  [2 / 2] Making request for hahahahaha...
[6 / 50] Processing hahahahahaha...
  [1 / 2] Making request for hahahahahaha...
  [2 / 2] Making request for hahahahahaha...
[7 / 50] Processing hahahahahahaha...
  [1 / 2] Making request for hahahahahahaha...
  [2 / 2] Making request for hahahahahahaha...
[8 / 50] Processing hahahahahahahaha...
  [1 / 2] Making request for hahahahahahahaha...
  [2 / 2] Making request for hahahahahahahaha...
[9 / 50] Processing hahahahahahahahaha...
  [1 / 2] Making reque

Exception: Erro na requisição