In [2]:
import concurrent.futures
import glob
import os
from urllib.error import HTTPError, URLError

import imutils
import numpy as np
import pandas as pd

from drive.MyDrive.twiitter_elections_models.cnn.model_1.predict_gender import predict_gender_simple_img

In [3]:
def get_files(directory, output_directory, extension = 'csv'):
    """List the source files that have no same-named file in the output directory.

    Both directory arguments are concatenated directly with the glob pattern,
    so they must end with a path separator (e.g. ``'/data/in/'``).

    Args:
        directory: Folder holding the source files.
        output_directory: Folder holding the files that were already produced.
        extension: File extension to look for, without the leading dot.

    Returns:
        Paths (as returned by ``glob.glob``) of the files in ``directory``
        whose base name does not exist in ``output_directory``.
    """
    pattern = '*.{}'.format(extension)

    # Compare base names, not full paths: paths coming from two different
    # directories can never be equal, so a full-path comparison would never
    # filter anything out and every file would be reprocessed.
    already_done = {
        os.path.basename(path) for path in glob.glob(output_directory + pattern)
    }

    return [
        path
        for path in glob.glob(directory + pattern)
        if os.path.basename(path) not in already_done
    ]

def analyse_img_url(url, images_cache):
    """Run the gender predictor on the image behind ``url``, caching by URL.

    Args:
        url: Image URL. Any non-string value is treated as "no image"
            (presumably the NaN pandas yields for an empty cell - verify
            against the input data).
        images_cache: Mutable dict mapping URL -> prediction. It is read
            first and updated in place after a successful prediction.

    Returns:
        The value returned by ``predict_gender_simple_img`` for the
        downloaded image, or an empty dict when ``url`` is not a string or
        the download fails. Failed downloads are not cached.
    """
    if not isinstance(url, str):
        return {}

    if url in images_cache:
        return images_cache[url]

    try:
        img = imutils.url_to_image(url)
        predict_result = predict_gender_simple_img(img)
    except URLError:
        # URLError is the base class of HTTPError, so HTTP status errors are
        # still handled, and so are DNS failures / refused connections that
        # would otherwise abort the processing of the whole dataset.
        return {}

    images_cache[url] = predict_result
    return predict_result

def parse_dataset(dataset, images_cache):
    """Analyse every ``profile_img`` URL of ``dataset`` and tabulate the results.

    Args:
        dataset: DataFrame with a ``profile_img`` column of image URLs.
        images_cache: Dict shared by all rows so a repeated URL is analysed
            only once; it is passed straight to ``analyse_img_url``.

    Returns:
        A DataFrame with one row per input row, indexed like ``dataset``,
        whose columns are the keys of the per-image result dicts. Rows with
        an empty result hold missing values.
    """
    results = dataset['profile_img'].apply(
        lambda url: analyse_img_url(url, images_cache)
    )
    # Reuse the input index: the caller joins this frame back onto `dataset`
    # by index, which would misalign rows if a fresh 0..n-1 index were used
    # on a filtered or re-indexed dataset.
    return pd.DataFrame(results.to_list(), index=dataset.index)

def transform_csv(file):
  """Read one CSV file and return it with the image-analysis columns appended.

  Args:
      file: Path of the CSV file to process.

  Returns:
      The DataFrame read from ``file`` joined (by index) with the frame
      produced by ``parse_dataset``.
  """
  # Dataset label used in the log line: base name without the extension.
  filename, _ = os.path.splitext(os.path.basename(file))

  data = pd.read_csv(file)
  print(f'Analisando dataset({filename})[{data.shape[0]}]')

  # Each file gets its own, initially empty, URL -> result cache.
  predictions = parse_dataset(data, {})

  return data.join(predictions)

In [4]:

# Folder that receives the enriched CSVs; also used to decide which source
# files still need processing.
output_path = '/content/drive/MyDrive/twiitter_elections_data/cnn/missing/jair/'
files = get_files(
    '/content/drive/MyDrive/twiitter_elections_data/pure/missing/jair/',
    output_path,
)

print(f'Arquivos {files}')

with concurrent.futures.ThreadPoolExecutor(max_workers = 50) as executor:
  # Submit one job per file and remember which file each future belongs to.
  future_to_file = {}
  for file in files:
    future_to_file[executor.submit(transform_csv, file)] = file

  # Handle the results in completion order rather than submission order.
  for future in concurrent.futures.as_completed(future_to_file):
    file = future_to_file[future]
    try:
      dataset = future.result()
    except Exception as exc:
      # A failing file is reported and skipped; the other files carry on.
      print(f'{file} exception {exc}')
      continue

    # Base name without extension, reused for the output file name.
    filename = os.path.splitext(os.path.basename(file))[0]

    print(f'Dataset({filename}) analisado [{dataset.shape[0]}]')
    # Save the enriched dataset under the same name in the output folder.
    dataset.to_csv(output_path + filename + '.csv')
    print(f'Dataset({filename}) salvo [{file}]')
        

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
predict_result =: {'gender': 'Male', 'gender_confidence_score': -5.261525}
predict_result =: {'gender': 'Female', 'gender_confidence_score': 0.8550162}
predict_result =: {'gender': 'Female', 'gender_confidence_score': 0.7030949}
predict_result =: {'gender': 'Female', 'gender_confidence_score': 0.7030949}
predict_result =: {'gender': 'Female', 'gender_confidence_score': 0.7030949}
predict_result =: {'gender': 'Female', 'gender_confidence_score': 0.7030949}
predict_result =: {'gender': 'Female', 'gender_confidence_score': 0.7030949}
predict_result =: {'gender': 'Female', 'gender_confidence_score': 0.7030949}predict_result =: {'gender': 'Female', 'gender_confidence_score': 0.7030949}

predict_result =: {'gender': 'Female', 'gender_confidence_score': 0.7030949}
predict_result =: {'gender': 'Female', 'gender_confidence_score': 0.7030949}
predict_result =: {'gender': 'Female', 'gender_confidence_score': 0.7030949}predic

In [24]:
# for file in get_files('/content/drive/MyDrive/twiitter_elections_data/pure/'):
#    transform_csv(file,'/content/drive/MyDrive/twiitter_elections_data/cnn/')

# transform_csv('/content/drive/MyDrive/twiitter_elections_data/pure/query_(from:jairbolsonaro).csv',
#               '/content/drive/MyDrive/twiitter_elections_data/cnn/')