## Imports

In [4]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import os
import pandas as pd
import numpy as np
import ast
import re

## Download Dataset

In [2]:
# Download latest version
path = kagglehub.dataset_download("ahmedshahriarsakib/top-1000-twitter-celebrity-tweets-embeddings")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/ahmedshahriarsakib/top-1000-twitter-celebrity-tweets-embeddings?dataset_version_number=3...


100%|██████████| 171M/171M [00:29<00:00, 6.02MB/s] 

Extracting files...





Path to dataset files: C:\Users\tejas\.cache\kagglehub\datasets\ahmedshahriarsakib\top-1000-twitter-celebrity-tweets-embeddings\versions\3


In [18]:
BASE_PATH = os.path.join(os.path.join(path, "twitter-celebrity-tweets-data"), "twitter-celebrity-tweets-data")
# List to store dataframes from each celebrity
all_celebrity_data = []

# Iterate through each file in the specified directory
for filename in os.listdir(BASE_PATH):
    if filename.endswith('.csv'):
        celebrity_name = os.path.splitext(filename)[0]
        
        file_path = os.path.join(BASE_PATH, filename)
        try:
            # Read the CSV file
            df = pd.read_csv(file_path)
            df.head()
    
            # Determine the number of rows to sample
            num_rows = len(df)
            sample_size = min(num_rows, 50)

            if sample_size > 0:
                # Randomly sample 'sample_size' rows
                sampled_df = df.sample(n=sample_size, random_state=42) # Using random_state for reproducibility
                sampled_df['name'] = celebrity_name
                all_celebrity_data.append(sampled_df)
                
        except pd.errors.EmptyDataError:
            print(f"Warning: {filename} is empty and was skipped.")
        except Exception as e:
            print(f"Error processing {filename}: {e}")

In [19]:
# Concatenate all sampled dataframes into a single dataframe
if all_celebrity_data:
    final_df = pd.concat(all_celebrity_data, ignore_index=True)
    print("\n--- Data Extraction Complete ---")
    print(f"Final DataFrame shape: {final_df.shape}")
    print("\nFirst 5 rows of the combined DataFrame:")
    print(final_df.head())
else:
    print("\nNo CSV files found or processed. Final DataFrame is empty.")


--- Data Extraction Complete ---
Final DataFrame shape: (45031, 4)

First 5 rows of the combined DataFrame:
            twitter_id                       date  \
0  1265675160813780996  2020-05-27 16:04:08+00:00   
1  1352116023261097987  2021-01-21 04:49:15+00:00   
2  1305913958898053121  2020-09-15 16:58:45+00:00   
3  1515766356548984835  2022-04-17 18:57:15+00:00   
4  1166752233305886721  2019-08-28 16:39:44+00:00   

                                               tweet    name  
0  b'RT @OSDONOSDABOLA: O Craque @10neto j\xc3\xa...  10neto  
1  b'@plihalespn Vc t\xc3\xa1 certo \xf0\x9f\x97\...  10neto  
2  b'Parab\xc3\xa9ns ao @Gremio pelos 117 anos de...  10neto  
3  b'Inacredit\xc3\xa1vel o p\xc3\xaanalti que ma...  10neto  
4  b'RT @OSDONOSDABOLA: A\xc3\xad galera, vamos v...  10neto  


In [20]:
final_df['name'].unique()

array(['10neto', '10Ronaldinho', '143redangel', '1LoganHenderson',
       '1victorvaldes', '21LVA', '3gerardpique', '4everBrandy', '50cent',
       '5SOS', '6BillionPeople', '7sainaljassmi', 'aarbeloa17',
       'aaronpaul_8', 'ABdeVilliers17', 'abdulrahman', 'ActorLeeMinHo',
       'ActuallyNPH', 'acunilicali', 'AdalRamones', 'adamlambert',
       'adamlevine', 'AdamSchefter', 'AdelAliBinAli', 'Adela_Micha',
       'Adele', 'AdnanAlarour', 'agnezmo', 'aguerosergiokun', 'ahickmann',
       'ahmethc', 'AHMTKURAL', 'ajaydevgn', 'Akon', 'akshaykumar',
       'AlbertoCiurana', 'AlejandroSanz', 'alexoficial', 'AlfredoFlores',
       'aliaa08', 'aliciakeys', 'alinebarros', 'AllRiseSilver',
       'alo_oficial', 'AlvaroMorata', 'Alwaleed_Talal', 'alyankovic',
       'Alyssa_Milano', 'amandabynes', 'amrdiab', 'amrkhaled', 'amrwaked',
       'amyschumer', 'Anahi', 'anakarylle', 'ANAMARIABRAGA',
       'anandmahindra', 'andersoncooper', 'AndreaLegarreta',
       'AndreaSernaTV', 'andreolifelipe'

In [21]:
final_df.to_csv("D:/Projects/Personal/Hackathon/TwinSpherev2/Data/CelebrityTweets.csv")

In [22]:
celebrity_tweets = pd.read_csv("D:/Projects/Personal/Hackathon/TwinSpherev2/Data/CelebrityTweets.csv")

In [26]:
celebrity_tweets.head()

Unnamed: 0.1,Unnamed: 0,twitter_id,date,tweet,name
0,0,1265675160813780996,2020-05-27 16:04:08+00:00,b'RT @OSDONOSDABOLA: O Craque @10neto j\xc3\xa...,10neto
1,1,1352116023261097987,2021-01-21 04:49:15+00:00,b'@plihalespn Vc t\xc3\xa1 certo \xf0\x9f\x97\...,10neto
2,2,1305913958898053121,2020-09-15 16:58:45+00:00,b'Parab\xc3\xa9ns ao @Gremio pelos 117 anos de...,10neto
3,3,1515766356548984835,2022-04-17 18:57:15+00:00,b'Inacredit\xc3\xa1vel o p\xc3\xaanalti que ma...,10neto
4,4,1166752233305886721,2019-08-28 16:39:44+00:00,"b'RT @OSDONOSDABOLA: A\xc3\xad galera, vamos v...",10neto


In [28]:
# Group by name and sample + concatenate
grouped = (
    celebrity_tweets.groupby("name")["tweet"]
    .apply(lambda x: "<ENDOFTWEET>".join(x.sample(n=min(100, len(x)), random_state=42)))
    .reset_index(name="concatenated_tweets")
)

In [29]:
grouped.head()

Unnamed: 0,name,concatenated_tweets
0,10Ronaldinho,b'Baita recep\xc3\xa7\xc3\xa3o dessa galera an...
1,10neto,b'@andreolifelipe A\xc3\xad brigam com a gente...
2,143redangel,b'Julie Yap-Daza = huli (To get caught). \n\ne...
3,1LoganHenderson,"b""Check this out! @LOWDNoizez' debut single of..."
4,1victorvaldes,b'\xd8\xb4\xd8\xb1\xd9\x83\xd8\xa9 \xd8\xa7\xd...


In [30]:
# Save to a new CSV (optional)
grouped.to_csv("grouped_concatenated_tweets.csv", index=False)