In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from textblob import TextBlob
import re, random

class InPoDa:
    """Load a JSON export of tweets and answer simple questions about it.

    On construction the file is read with ``pandas.read_json`` and each tweet
    is enriched with derived columns: ``hashtags``, ``mentions``, ``urls``,
    ``topics``, ``cleaned_text`` and ``sentiments``.

    Attributes:
        dataframe: ``pandas.DataFrame`` with one row per tweet.
        data: the same rows as a list of dicts (``DataFrame.to_dict('records')``).
    """

    # Raw columns that no method of this class reads; dropped when present.
    _UNUSED_COLUMNS = ('_id', 'public_metrics', 'conversation_id', 'geo',
                       'lang', 'created_at', 'attachments')

    # Strips hashtags, @-mentions, line breaks and every character that is not
    # a (French) letter, a digit or whitespace.
    _NOISE_RE = re.compile(r'#\w+|@\w+|(\r\n|\r|\n)|[^a-zA-Zàâçéèêëîïôûùüæœ\d\s]')

    def __init__(self, json_path, seed=None):
        """Read ``json_path`` and build the enriched tweet table.

        Args:
            json_path: location of the JSON file, passed to ``pandas.read_json``.
            seed: optional seed for the random fallback topic (see
                ``_topics_for``). ``None`` keeps the historical behaviour of
                drawing from the global ``random`` module.

        Raises:
            KeyError: if the file has no ``entities``, ``text`` or
                ``context_annotations`` column.
        """
        df_tweets = pd.read_json(json_path)
        # errors='ignore': optional columns (e.g. 'geo', 'attachments') may be
        # absent from an export; that must not abort loading.
        df_tweets = df_tweets.drop(columns=list(self._UNUSED_COLUMNS), errors='ignore')

        rng = random if seed is None else random.Random(seed)

        hashtags, mentions, urls = [], [], []
        for entities in df_tweets['entities']:
            hashtags.append(self._entity_values(entities, 'hashtags', 'tag', unique=True))
            mentions.append(self._entity_values(entities, 'mentions', 'id'))
            urls.append(self._entity_values(entities, 'urls', 'url'))

        cleaned_text = []
        for text, tweet_urls in zip(df_tweets['text'], urls):
            # Remove the URLs first: the regex below would otherwise leave
            # their alphanumeric remains in the text.
            for url in tweet_urls:
                text = text.replace(url, '')
            cleaned_text.append(self._NOISE_RE.sub('', text))

        topics = [
            self._topics_for(annotations, text, rng)
            for annotations, text in zip(df_tweets['context_annotations'], cleaned_text)
        ]

        sentiments = [TextBlob(text).sentiment for text in cleaned_text]

        df_tweets['hashtags'] = hashtags
        df_tweets['mentions'] = mentions
        df_tweets['urls'] = urls
        df_tweets['topics'] = topics
        df_tweets['cleaned_text'] = cleaned_text
        df_tweets['sentiments'] = sentiments
        df_tweets = df_tweets.drop(columns=['context_annotations', 'entities'])
        self.dataframe = df_tweets
        self.data = df_tweets.to_dict('records')

        # Status banner. %-formatting (instead of '+') so that json_path may
        # also be a pathlib.Path, which pandas.read_json accepts.
        banner = 50 * '*'
        print('%s\nInstance InPoDa initialisée avec succès\n%s\n%d tweets détectés dans "%s"'
              % (banner, banner, len(self.data), json_path))

    @staticmethod
    def _entity_values(entities, kind, field, unique=False):
        """Return ``item[field]`` for every item of ``entities[kind]``.

        Returns ``[]`` when ``entities`` is not a mapping (e.g. NaN for a tweet
        without entities), when ``kind`` is missing, or when an item lacks
        ``field``. With ``unique=True`` duplicates are removed, keeping the
        order of first appearance.
        """
        try:
            values = [item[field] for item in entities[kind]]
            return list(dict.fromkeys(values)) if unique else values
        except (KeyError, TypeError):
            return []

    @staticmethod
    def _topics_for(annotations, text, rng):
        """Return the topic list of one tweet.

        Uses the distinct ``entity.name`` values of ``annotations`` when they
        can be read. Otherwise falls back to one word drawn by ``rng`` from the
        longest third of the words in ``text``; texts with fewer than three
        words yield ``[]``.
        """
        try:
            return list(dict.fromkeys(item['entity']['name'] for item in annotations))
        except (KeyError, TypeError):
            pass
        words = sorted(text.split(), key=len, reverse=True)
        candidates = words[:len(words) // 3]
        return [rng.choice(candidates)] if candidates else []

    def _show(self, xy):
        """Draw a bar chart from ``xy = [labels, counts]``.

        With no data an empty, labelled chart is drawn instead of raising.
        """
        labels, counts = xy
        _, ax = plt.subplots(figsize=(12, 8))
        if counts:
            pd.Series(counts).plot(kind='bar', ax=ax)
            ax.set_xticklabels(labels)
        ax.set_title('Amount Frequency')
        ax.set_xlabel('Elements')
        ax.set_ylabel('Frequency')

    def top_k_users(self, k: int):
        """Plot the ``k`` authors with the most tweets."""
        top = self.dataframe['author_id'].value_counts().iloc[0:k]
        self._show([list(top.index), list(top.values)])

    def _count_df_occ(self, arg: str, explore_list: bool):
        """Count occurrences in column ``arg`` of ``self.dataframe``.

        Args:
            arg: column name.
            explore_list: when true each cell is iterated and its elements are
                counted; otherwise the cell value itself is counted.

        Returns:
            dict mapping value -> count, in order of first appearance.
        """
        occ = {}
        for cell in self.dataframe[arg]:
            for value in (cell if explore_list else (cell,)):
                occ[value] = occ.get(value, 0) + 1
        return occ

    def _top_k(self, arg: str, k, explore_list=True):
        """Return ``[values, counts]`` of column ``arg``, most frequent first.

        Only the first ``k`` entries are returned; ``k=None`` returns all of
        them. Ties keep their order of first appearance (the sort is stable).
        """
        occ = self._count_df_occ(arg, explore_list)
        ranked = sorted(occ.items(), key=lambda pair: pair[1], reverse=True)
        values = [value for value, _ in ranked]
        counts = [count for _, count in ranked]
        return [values[:k], counts[:k]]

    def top_k_hashtags(self, k: int):
        """Plot the ``k`` most used hashtags."""
        self._show(self._top_k('hashtags', k))

    def top_k_mentions(self, k: int):
        """Plot the ``k`` most mentioned user ids."""
        self._show(self._top_k('mentions', k))

    def top_k_topics(self, k: int):
        """Plot the ``k`` most frequent topics."""
        self._show(self._top_k('topics', k))

    def nb_pubs_per_user(self):
        """Plot the number of tweets of every author."""
        self._show(self._top_k('author_id', None, False))

    def nb_pubs_per_hashtag(self):
        """Plot the number of tweets of every hashtag."""
        # k=None: there can be more distinct hashtags than tweets, so capping
        # at len(self.data) would silently drop some of them.
        self._show(self._top_k('hashtags', None))

    def nb_pubs_per_topic(self):
        """Plot the number of tweets of every topic."""
        # k=None for the same reason as in nb_pubs_per_hashtag.
        self._show(self._top_k('topics', None))

    def tweets_of(self, user: int):
        """Return the tweets (records of ``self.data``) written by ``user``."""
        return [tweet for tweet in self.data if tweet['author_id'] == user]

    def tweets_that_mention(self, user: int):
        """Return the tweets that mention ``user``, each tweet at most once.

        Ids are compared as strings, so ``user`` may be an int or a str.
        """
        user = str(user)
        return [
            tweet for tweet in self.data
            if any(str(mention) == user for mention in tweet['mentions'])
        ]

    def users_that_mention_hashtag(self, tag: str):
        """Return the distinct author ids that used hashtag ``tag``."""
        authors = [tweet['author_id'] for tweet in self.data if tag in tweet['hashtags']]
        return list(dict.fromkeys(authors))

    def users_mentionned_by(self, user: int):
        """Return the distinct user ids mentioned in the tweets of ``user``."""
        mentioned = []
        for tweet in self.data:
            if tweet['author_id'] == user:
                mentioned.extend(tweet['mentions'])
        return list(dict.fromkeys(mentioned))

In [2]:
# Tweets JSON file to analyse; a bare filename, so it is resolved relative to
# the kernel's working directory.
path = "versailles_tweets_100.json"
# Constructing InPoDa reads the file, derives the extra columns and prints a
# status banner with the number of tweets found.
instance1 = InPoDa(path)

**************************************************
Instance InPoDa initialisée avec succès
**************************************************
20 tweets détectés dans "versailles_tweets_100.json"


In [7]:
# Example calls, one per public InPoDa method; all are disabled.
# Uncomment a line to run it against `instance1` (created in an earlier cell).
#instance1.top_k_users(3)
#instance1.top_k_hashtags(4)
#instance1.top_k_mentions(4)
#instance1.top_k_topics(3)
#instance1.nb_pubs_per_user()
#instance1.nb_pubs_per_hashtag()
#instance1.nb_pubs_per_topic()
#instance1.tweets_of(717025418)
#instance1.tweets_that_mention(19811019)
#instance1.users_that_mention_hashtag('CIV')
#instance1.users_mentionned_by(992904738516717568)