In [13]:
import pandas as pd
import numpy as np
from twitterelectionbr.cnn.model_1.predict_gender import predict_gender_simple_img
import imutils

In [2]:
tweets_dilma = pd.read_csv('../raw_data/data/2014/DILMA_ROUSSEFF/#dilma/query_#dilma.csv', parse_dates=True)

In [3]:
tweets_dilma.shape

(37157, 21)

In [12]:
tweets_dilma.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37157 entries, 0 to 37156
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              37157 non-null  object
 1   date             37157 non-null  object
 2   content          37157 non-null  object
 3   id               37157 non-null  int64 
 4   reply_count      37157 non-null  int64 
 5   retweet_count    37157 non-null  int64 
 6   like_count       37157 non-null  int64 
 7   quote_count      37157 non-null  int64 
 8   lang             37157 non-null  object
 9   username         37157 non-null  object
 10  displayname      37155 non-null  object
 11  description      27664 non-null  object
 12  verified         37157 non-null  bool  
 13  created          37157 non-null  object
 14  followers_count  37157 non-null  int64 
 15  friends_count    37157 non-null  int64 
 16  location         24131 non-null  object
 17  protected        37157 non-null

In [4]:
def analyse_img_url(url):
    #print('URL ' + url)
    img = imutils.url_to_image(url)
    return predict_gender_simple_img(img)

In [14]:
def transform_dataset(dataset):
    
    dataset_unique = dataset.drop_duplicates(subset=['username']).sample(10)

    dataset_unique['cnn'] = [analyse_img_url(row) for row in dataset_unique['profile_img']]
    dataset_unique['gender']  = dataset_unique['cnn'].apply(lambda score_dict: score_dict.get('gender', np.nan))
    dataset_unique['gender_confidence_score']  = dataset_unique['cnn'].apply(lambda score_dict: score_dict.get('gender_confidence_score', np.nan))

    merged = pd.merge(tweets_dilma, dataset_unique, how='inner', 
                      left_on=['username'],right_on=['username'],
                      suffixes=('', '_delme'))

    # Discard the columns that acquired a suffix
    merged = merged[[c for c in merged.columns if not c.endswith('_delme')]]
    
    return merged

In [15]:
result = transform_dataset(tweets_dilma)

In [16]:
result

Unnamed: 0,url,date,content,id,reply_count,retweet_count,like_count,quote_count,lang,username,...,followers_count,friends_count,location,protected,profile_img,query,crawled_date,cnn,gender,gender_confidence_score
0,https://twitter.com/jessicskull/status/4890667...,2014-07-15 15:19:42,A #Tim está igual a #Dilma. Tá servindo pra n...,489066785397022720,0,0,1,0,pt,jessicskull,...,778,859,,False,https://pbs.twimg.com/profile_images/150133844...,#dilma,2022-06-02,"{'gender': 'Male', 'gender_confidence_score': ...",Male,0.984948
1,https://twitter.com/jessicskull/status/5258093...,2014-10-25 00:41:26,"@g1 #Dilma, se dando BeeeeeeM, mau! Kkkkkk",525809323839340544,0,0,0,0,pt,jessicskull,...,778,859,,False,https://pbs.twimg.com/profile_images/150133844...,#dilma,2022-06-02,"{'gender': 'Male', 'gender_confidence_score': ...",Male,0.984948
2,https://twitter.com/jessicskull/status/4866380...,2014-07-08 22:28:35,Humilhante não eh perde de 7x1. Humilhante s...,486638002316533760,0,1,2,0,pt,jessicskull,...,778,859,,False,https://pbs.twimg.com/profile_images/150133844...,#dilma,2022-06-02,"{'gender': 'Male', 'gender_confidence_score': ...",Male,0.984948
3,https://twitter.com/andreluisantana/status/525...,2014-10-25 02:12:15,"Nas perguntas sobre violência e drogas,concent...",525832179788427264,0,0,0,0,pt,andreluisantana,...,1890,1363,"Salvador-BA, Benin",False,https://pbs.twimg.com/profile_images/147090848...,#dilma,2022-06-02,"{'gender': 'Male', 'gender_confidence_score': ...",Male,0.765194
4,https://twitter.com/Pers3phon9/status/52582064...,2014-10-25 01:26:26,"#Aecio gaguejando perdeu a conexão 4G, ta usan...",525820646614634496,0,0,0,0,pt,Pers3phon9,...,149,658,Brazil,False,https://pbs.twimg.com/profile_images/139450705...,#dilma,2022-06-02,"{'gender': 'Female', 'gender_confidence_score'...",Female,0.852856
5,https://twitter.com/Pers3phon9/status/48554808...,2014-07-05 22:17:39,#dilma vc devia fazer isso!! http://t.co/FGhYm...,485548087574671360,0,0,0,0,pt,Pers3phon9,...,149,658,Brazil,False,https://pbs.twimg.com/profile_images/139450705...,#dilma,2022-06-02,"{'gender': 'Female', 'gender_confidence_score'...",Female,0.852856
6,https://twitter.com/naiany/status/525817036002...,2014-10-25 01:12:05,Não to vendo o debate mas passei na paulista e...,525817036002566145,1,0,0,0,pt,naiany,...,167,862,"São Paulo, SP - Brasil",False,https://pbs.twimg.com/profile_images/563316418...,#dilma,2022-06-02,{},,
7,https://twitter.com/Lucasfrota11/status/512008...,2014-09-16 22:41:35,Sabe quantas pessoal precisa pra destruir todo...,512008423383519235,0,0,0,0,pt,Lucasfrota11,...,361,730,Fortaleza-ce,False,https://pbs.twimg.com/profile_images/104660051...,#dilma,2022-06-02,"{'gender': 'Male', 'gender_confidence_score': ...",Male,0.985694
8,https://twitter.com/InformeRJO/status/51890632...,2014-10-05 23:31:23,RT @Praiadopepe 13 de junho de 2013 (#Vempraru...,518906325313781762,0,1,0,0,pt,InformeRJO,...,186127,193,Rio de Janeiro,False,https://pbs.twimg.com/profile_images/887009084...,#dilma,2022-06-02,"{'gender': 'Male', 'gender_confidence_score': ...",Male,0.993137
9,https://twitter.com/MarinaSilvaBLA/status/5210...,2014-10-11 19:54:35,La em casa todo mundo é 13 !!!!!!!!!!!!!!!!! #...,521026091721711616,0,3,3,0,pt,MarinaSilvaBLA,...,320,608,,False,https://pbs.twimg.com/profile_images/521004775...,#dilma,2022-06-02,"{'gender': 'Female', 'gender_confidence_score'...",Female,0.948449
