## Evaluación diagnóstica gitflow

Programar en Python cuatro funciones:

1. Los top 10 tweets más retweeted.
2. Los top 10 usuarios en función a la cantidad de tweets que emitieron.
3. Los top 10 días donde hay más tweets.
4. Top 10 hashtags más usados.

Dataset: https://www.kaggle.com/datasets/prathamsharma123/farmers-protest-tweets-dataset-raw-json

In [1]:
# Import
import json
import re
from heapq import nlargest 
import pandas as pd
from pandas.io.json import json_normalize
import warnings
warnings.filterwarnings("ignore")

### Función top retweeted

In [2]:
def top_retweets(df, ntop=10):
    """Return the ``ntop`` most-retweeted tweets.

    Parameters
    ----------
    df : pandas.DataFrame
        Tweet table; must contain a ``retweetCount`` column.
    ntop : int, default 10
        Number of rows to keep.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with the largest ``retweetCount`` values, in
        descending order, keeping all original columns and index labels.
    """
    ranking_column = 'retweetCount'
    return df.nlargest(ntop, ranking_column)

### Función top users

In [3]:
def top_users(df, ntop=10):
    """Return the ``ntop`` users that posted the most tweets.

    Only the ``tweetId`` column is counted per user; counting every column
    of the frame (as a bare ``groupby().count()`` does) is wasted work on a
    wide tweet table because a single count column is needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Tweet table with at least ``username`` and ``tweetId`` columns.
    ntop : int, default 10
        Number of users to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``index``, ``username`` and ``cant_tweets``, sorted by
        ``cant_tweets`` descending. ``index`` is the row position of the
        user in the per-user count table (users sorted by name); it is kept
        so the returned shape stays the same for existing callers.
    """
    # One non-null tweetId count per username, as a two-column frame.
    tweets_per_user = (
        df.groupby('username')['tweetId']
        .count()
        .reset_index(name='cant_tweets')
    )
    ranking = tweets_per_user.nlargest(ntop, 'cant_tweets')
    # reset_index() without drop=True intentionally materialises the old
    # positions as the ``index`` column (see Returns).
    return ranking.reset_index()

### Función top days

In [4]:
def top_days(df, ntop=10):
    """Return the ``ntop`` days with the most tweets.

    Only the ``tweetId`` column is counted per day; counting every column
    of the frame (as a bare ``groupby().count()`` does) is wasted work on a
    wide tweet table because a single count column is needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Tweet table with at least ``day`` and ``tweetId`` columns.
    ntop : int, default 10
        Number of days to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``index``, ``day`` and ``cant_tweets``, sorted by
        ``cant_tweets`` descending. ``index`` is the row position of the
        day in the per-day count table (days in ascending order); it is
        kept so the returned shape stays the same for existing callers.
    """
    # One non-null tweetId count per day, as a two-column frame.
    tweets_per_day = (
        df.groupby('day')['tweetId']
        .count()
        .reset_index(name='cant_tweets')
    )
    ranking = tweets_per_day.nlargest(ntop, 'cant_tweets')
    # reset_index() without drop=True intentionally materialises the old
    # positions as the ``index`` column (see Returns).
    return ranking.reset_index()

### Función top hashtags

In [5]:
def top_hashtags(df, ntop=10, ignore_case=False):
    """Return the ``ntop`` most frequent hashtags found in the tweet text.

    Hashtags are extracted from the ``renderedContent`` column with the
    pattern ``#(\\w+)``; the leading ``#`` is not part of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Tweet table with a ``renderedContent`` column.
    ntop : int, default 10
        Number of hashtags to return.
    ignore_case : bool, default False
        When True, hashtags are lower-cased before counting so that e.g.
        ``FarmersProtest`` and ``farmersprotest`` are counted together.
        The default keeps the case-sensitive counting.

    Returns
    -------
    list of str
        Hashtags ordered from most to least frequent; ties keep the order
        in which the hashtags were first seen.
    """
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    hashtag_pattern = re.compile(r"#(\w+)")
    freq = {}
    for tweet in df["renderedContent"]:
        # Missing content (None/NaN) is not a string and has no hashtags.
        if not isinstance(tweet, str):
            continue
        for hashtag in hashtag_pattern.findall(tweet):
            if ignore_case:
                hashtag = hashtag.lower()
            freq[hashtag] = freq.get(hashtag, 0) + 1
    return nlargest(ntop, freq, key=freq.get)

### Preprocesamiento dataset

In [11]:
def parse_date(row):
    """Return ``row`` truncated to midnight (00:00:00) of the same day.

    All sub-day components are cleared, including microseconds and, for
    objects that expose a ``nanosecond`` attribute (e.g. ``pandas.Timestamp``),
    nanoseconds. Leaving them in place would make two timestamps from the
    same day compare as different days.

    Parameters
    ----------
    row : datetime.datetime or pandas.Timestamp
        The timestamp to truncate; any object with a compatible ``replace``.

    Returns
    -------
    Same type as ``row``
        The truncated timestamp; time-zone information is left untouched.
    """
    midnight = {'hour': 0, 'minute': 0, 'second': 0, 'microsecond': 0}
    # datetime.datetime.replace() rejects a ``nanosecond`` keyword, so only
    # pass it to objects that actually carry that field.
    if hasattr(row, 'nanosecond'):
        midnight['nanosecond'] = 0
    return row.replace(**midnight)

In [12]:
def process_data(path_archivo):
    """Load the raw tweet dump and return a cleaned, English-only tweet table.

    Steps:

    1. Read the JSON-lines file at ``path_archivo``.
    2. Keep only rows whose ``lang`` is ``'en'``.
    3. Lift ``id`` and ``username`` out of the nested ``user`` dict into the
       ``userId`` and ``username`` columns.
    4. Keep the relevant columns, renaming ``id`` -> ``tweetId`` and
       ``url`` -> ``tweetUrl``.
    5. Drop duplicated ``tweetId`` rows (first occurrence wins).
    6. Add a ``day`` column: ``date`` truncated to midnight.

    No separate per-user table is built here: only the tweet table is
    returned, so such a table would be discarded.

    Parameters
    ----------
    path_archivo : str or path-like
        Location of the JSON-lines tweet file.

    Returns
    -------
    pandas.DataFrame
        One row per unique English tweet with the columns listed in ``cols``
        (after renaming) plus ``day``.
    """
    raw_tweets = pd.read_json(path_archivo, lines=True)
    # Explicit copy so the column assignments below never write into a view
    # of ``raw_tweets``.
    english = raw_tweets.loc[raw_tweets['lang'] == 'en'].copy()

    # Flatten the two fields needed from the nested ``user`` dict.
    english['userId'] = english['user'].map(lambda user: user['id'])
    english['username'] = english['user'].map(lambda user: user['username'])

    # Keep only the columns used downstream.
    cols = ['url', 'date', 'renderedContent', 'id', 'username', 'userId', 'replyCount', 'retweetCount', 'likeCount', 'quoteCount', 'source', 'media', 'retweetedTweet', 'quotedTweet', 'mentionedUsers']
    tweets = (
        english[cols]
        .rename(columns={'id': 'tweetId', 'url': 'tweetUrl'})
        .drop_duplicates(subset=['tweetId'])
        .copy()
    )
    # Vectorised truncation to midnight; relies on ``date`` being a datetime
    # column. Unlike resetting hour/minute/second only, this also clears
    # sub-second components.
    tweets['day'] = tweets['date'].dt.normalize()
    return tweets

### Main

In [7]:
def main(numero_funcion, path_archivo="../basededatos/farmers-protest-tweets-2021-03-5.json"):
    """Load the tweet dataset and run one of the four "top 10" reports.

    Parameters
    ----------
    numero_funcion : int
        Which report to run:
        1 = most-retweeted tweets, 2 = most active users,
        3 = days with most tweets, 4 = most used hashtags.
    path_archivo : str, optional
        Path to the JSON-lines tweet file. Defaults to the location used by
        this notebook.

    Returns
    -------
    pandas.DataFrame or list
        Whatever the selected report function returns (a DataFrame for
        reports 1-3, a list of hashtags for report 4).

    Raises
    ------
    ValueError
        If ``numero_funcion`` is not 1, 2, 3 or 4. The check happens before
        the dataset is read so a bad selector fails fast instead of loading
        the whole file and returning ``None``.
    """
    if numero_funcion not in (1, 2, 3, 4):
        raise ValueError(
            "numero_funcion debe ser 1, 2, 3 o 4; se recibió {!r}".format(numero_funcion)
        )

    df = process_data(path_archivo)
    if numero_funcion == 1:
        print("\n Calculando el top 10 tweets más retweeted ...")
        return top_retweets(df)
    if numero_funcion == 2:
        print("\n Calculando el top 10 usuarios en función a la cantidad de tweets que emitieron")
        return top_users(df)
    if numero_funcion == 3:
        print("\n Calculando el top 10 días donde hay más tweets")
        return top_days(df)
    # Only selector 4 can reach this point after the validation above.
    print("\n Calculando el tp 10 hashtags más usados")
    return top_hashtags(df)
        
    

In [8]:
# Report 1: the ten most-retweeted tweets.
main(1)


 Calculando el top 10 tweets más retweeted ...


Unnamed: 0,tweetUrl,date,renderedContent,tweetId,username,userId,replyCount,retweetCount,likeCount,quoteCount,source,media,retweetedTweet,quotedTweet,mentionedUsers
408128,https://twitter.com/rihanna/status/13566258896...,2021-02-02 15:29:51+00:00,why aren’t we talking about this?! #FarmersPro...,1356625889602199552,rihanna,79293791,163065,315547,944307,45832,"<a href=""http://twitter.com/download/iphone"" r...",,,,
395142,https://twitter.com/GretaThunberg/status/13566...,2021-02-02 20:04:01+00:00,We stand in solidarity with the #FarmersProtes...,1356694884615340037,GretaThunberg,1006419421244678144,49793,103957,319363,13815,"<a href=""http://twitter.com/download/iphone"" r...",,,,
266196,https://twitter.com/GretaThunberg/status/13572...,2021-02-04 10:59:01+00:00,I still #StandWithFarmers and support their pe...,1357282507616645122,GretaThunberg,1006419421244678144,39596,67694,234676,10587,"<a href=""http://twitter.com/download/iphone"" r...",,,,
366579,https://twitter.com/miakhalifa/status/13568483...,2021-02-03 06:14:01+00:00,"“Paid actors,” huh? Quite the casting director...",1356848397899112448,miakhalifa,2835653131,15569,35921,139959,5681,"<a href=""http://twitter.com/download/iphone"" r...",[{'previewUrl': 'https://pbs.twimg.com/media/E...,,,
372793,https://twitter.com/miakhalifa/status/13568277...,2021-02-03 04:51:48+00:00,What in the human rights violations is going o...,1356827705161879553,miakhalifa,2835653131,9082,26972,99227,4606,"<a href=""http://twitter.com/download/iphone"" r...",[{'previewUrl': 'https://pbs.twimg.com/media/E...,,,
314192,https://twitter.com/TeamJuJu/status/1357048037...,2021-02-03 19:27:19+00:00,"Happy to share that I’ve donated $10,000 to pr...",1357048037302960129,TeamJuJu,733170759829327874,7683,23251,59248,4082,"<a href=""http://twitter.com/download/iphone"" r...",,,,
215034,https://twitter.com/BobBlackman/status/1357755...,2021-02-05 18:19:19+00:00,There has been much social media coverage arou...,1357755699162398720,BobBlackman,805185025,1845,20132,42779,1592,"<a href=""https://mobile.twitter.com"" rel=""nofo...",[{'previewUrl': 'https://pbs.twimg.com/media/E...,,,
398011,https://twitter.com/vanessa_vash/status/135668...,2021-02-02 19:09:23+00:00,Farmers feed the world. Fight for them. Protec...,1356681136655769605,vanessa_vash,1134059457191776257,1301,18744,67986,820,"<a href=""http://twitter.com/download/android"" ...",,,,
325261,https://twitter.com/kylekuzma/status/135700972...,2021-02-03 16:55:04+00:00,Should be talking about this! #FarmersProtest\...,1357009721090138112,kylekuzma,272616327,4167,17368,39653,2505,"<a href=""http://twitter.com/download/iphone"" r...",,,,
163689,https://twitter.com/AmandaCerny/status/1359013...,2021-02-09 05:36:49+00:00,To all of my influencer/celeb friends- read up...,1359013362881994752,AmandaCerny,104856942,2028,15677,81375,813,"<a href=""http://twitter.com/download/iphone"" r...",,,,


In [9]:
# Report 2: the ten users with the most tweets.
main(2)


 Calculando el top 10 usuarios en función a la cantidad de tweets que emitieron


Unnamed: 0,index,username,cant_tweets
0,64326,harjot_tweeting,7134
1,89247,tasveersandhu,2088
2,85554,shells_n_petals,1991
3,70038,jot__b,1841
4,81848,rebelpacifist,1803
5,82836,rumsomal,1722
6,19687,Iamjazzie96,1491
7,22214,Jass_k_G,1458
8,11983,DigitalKisanBot,1453
9,93008,z_khalique007,1446


In [13]:
# Report 3: the ten days with the most tweets.
main(3)


 Calculando el top 10 días donde hay más tweets


Unnamed: 0,index,day,cant_tweets
0,2,2021-02-03 00:00:00+00:00,83403
1,3,2021-02-04 00:00:00+00:00,58300
2,4,2021-02-05 00:00:00+00:00,33165
3,1,2021-02-02 00:00:00+00:00,28440
4,5,2021-02-06 00:00:00+00:00,22298
5,6,2021-02-07 00:00:00+00:00,11244
6,8,2021-02-09 00:00:00+00:00,9269
7,7,2021-02-08 00:00:00+00:00,8863
8,9,2021-02-10 00:00:00+00:00,7938
9,10,2021-02-11 00:00:00+00:00,5668


In [15]:
# Report 4: the ten most used hashtags (returned as a plain list, hence print).
print(main(4))


 Calculando el tp 10 hashtags más usados
['FarmersProtest', 'IStandWithFarmers', 'farmersprotest', 'IndianFarmersHumanRights', 'FarmersAreIndia', 'StandWithFarmers', 'Rihanna', 'FarmersProtests', 'Farmers', 'shameonbollywood']
