## Evaluación diagnóstica gitflow

Programar en Python cuatro funciones:

1. Los top 10 tweets más retuiteados.
2. Los top 10 usuarios en función de la cantidad de tweets que emitieron.
3. Los top 10 días donde hay más tweets.
4. Top 10 hashtags más usados.
Dataset: https://www.kaggle.com/datasets/prathamsharma123/farmers-protest-tweets-dataset-raw-json

In [1]:
# Import
import json
import re
from heapq import nlargest 
import pandas as pd
from pandas.io.json import json_normalize
import warnings

### Función top retweeted

In [2]:
def top_retweets(df, ntop=10):
    """Return the ``ntop`` rows of ``df`` with the highest retweet count.

    Args:
        df: DataFrame of tweets containing a ``retweetCount`` column.
        ntop: Number of rows to keep (defaults to 10).

    Returns:
        DataFrame with the selected rows, ordered from most to least
        retweeted. All columns and the original index labels are kept.
    """
    return df.nlargest(ntop, ['retweetCount'])

### Función top users

In [11]:
def top_users(df, ntop=10):
    """Return the ``ntop`` users that posted the most tweets.

    Tweets are counted as the number of non-null ``tweetId`` values per
    ``username``.

    Args:
        df: DataFrame of tweets with ``username`` and ``tweetId`` columns.
        ntop: Number of users to keep (defaults to 10).

    Returns:
        DataFrame with columns ``index``, ``username`` and ``cant_tweets``,
        ordered from most to least active. ``index`` holds each user's row
        position in the per-user count table (which is sorted by username).
    """
    per_user = df.groupby(['username']).count().reset_index()
    per_user['cant_tweets'] = per_user['tweetId']
    ranked = per_user.nlargest(ntop, ['cant_tweets'])
    # The final reset_index() exposes the previous row labels as 'index'.
    return ranked[['username', 'cant_tweets']].reset_index()

### Función top days

In [19]:
def top_days(df, ntop=10):
    """Return the ``ntop`` days on which the most tweets were posted.

    Tweets are counted as the number of non-null ``tweetId`` values per
    ``day``.

    Args:
        df: DataFrame of tweets with ``day`` and ``tweetId`` columns.
        ntop: Number of days to keep (defaults to 10).

    Returns:
        DataFrame with columns ``index``, ``day`` and ``cant_tweets``,
        ordered from busiest to quietest day. ``index`` holds each day's row
        position in the per-day count table (which is sorted by day).
    """
    per_day = df.groupby(['day']).count().reset_index()
    per_day['cant_tweets'] = per_day['tweetId']
    ranked = per_day.nlargest(ntop, ['cant_tweets'])
    # The final reset_index() exposes the previous row labels as 'index'.
    return ranked[['day', 'cant_tweets']].reset_index()

### Función top hashtags

In [30]:
def top_hashtags(df, ntop=10):
    """Return the ``ntop`` most frequently used hashtags.

    Every entry of ``df["renderedContent"]`` is scanned for ``#word`` tokens
    and each occurrence is counted. Hashtags are compared exactly as written
    (case-sensitive) and are reported without the leading ``#``.

    Args:
        df: DataFrame (or any mapping) whose ``"renderedContent"`` entry
            iterates over the tweet texts.
        ntop: Maximum number of hashtags to return (defaults to 10).

    Returns:
        List of hashtag strings ordered from most to least used. The list is
        shorter than ``ntop`` when fewer distinct hashtags exist.
    """
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    pattern = re.compile(r"#(\w+)")
    freq = {}
    for tweet in df["renderedContent"]:
        # Missing text (None / NaN) is not a string and cannot be searched.
        if not isinstance(tweet, str):
            continue
        for hashtag in pattern.findall(tweet):
            freq[hashtag] = freq.get(hashtag, 0) + 1
    return nlargest(ntop, freq, key=freq.get)

### Preprocesamiento dataset

In [None]:
def process_data(path_archivo):
    """Load the raw tweets file and return a cleaned tweets DataFrame.

    The file is read as JSON Lines (one JSON object per line). Only rows
    whose ``lang`` is ``'en'`` are kept, the user id and username are pulled
    out of the nested ``user`` object into their own columns, a fixed subset
    of columns is selected, and duplicated tweets are dropped.

    Args:
        path_archivo: Path of the JSON Lines file to read.

    Returns:
        DataFrame with one row per distinct ``tweetId`` and the columns
        ``tweetUrl``, ``date``, ``renderedContent``, ``tweetId``,
        ``username``, ``userId``, ``replyCount``, ``retweetCount``,
        ``likeCount``, ``quoteCount``, ``source``, ``media``,
        ``retweetedTweet``, ``quotedTweet`` and ``mentionedUsers``. The row
        labels are those of the rows kept from the raw file.

    Raises:
        KeyError: If the file lacks ``lang``, ``user`` or any selected column.
    """
    raw_tweets = pd.read_json(path_archivo, lines=True)
    # Work on an explicit copy so that adding columns below does not write
    # into a filtered view of the raw frame (SettingWithCopyWarning).
    english = raw_tweets.loc[raw_tweets['lang'] == 'en'].copy()
    # Flatten the two fields needed from the nested 'user' object.
    english['userId'] = [user['id'] for user in english['user']]
    english['username'] = [user['username'] for user in english['user']]
    # Keep only the columns of interest.
    cols = ['url', 'date', 'renderedContent', 'id', 'username', 'userId', 'replyCount', 'retweetCount', 'likeCount', 'quoteCount', 'source', 'media', 'retweetedTweet', 'quotedTweet', 'mentionedUsers']
    tweets = english[cols].rename(columns={'id': 'tweetId', 'url': 'tweetUrl'})
    # The same tweet can appear more than once; keep its first occurrence.
    return tweets.drop_duplicates(subset=['tweetId'])