In [12]:
import pandas as pd
from tqdm import tqdm

In [13]:
# Relative path of the folder that every loader below reads its input file from.
DATA_FOLDER = 'data'

In [14]:
def load_sanders_tweets(filename: str = 'sanders_corpus.csv') -> pd.DataFrame:
    """Load the Sanders tweet corpus from ``DATA_FOLDER``.

    The ``TweetId`` and ``Sentiment`` columns are discarded, every row that
    contains a missing value is removed, and ``TweetDate`` is parsed into
    datetimes.

    Args:
        filename: Name of the CSV file inside ``DATA_FOLDER``.

    Returns:
        The remaining columns of the file, with ``TweetDate`` converted by
        ``pd.to_datetime``. The original row index is kept (not reset).
    """
    raw = pd.read_csv(f"{DATA_FOLDER}/{filename}")
    without_labels = raw.drop(columns=['TweetId', 'Sentiment']).dropna()
    # assign() replaces the column on a copy and keeps its position in the frame.
    return without_labels.assign(
        TweetDate=lambda frame: pd.to_datetime(frame['TweetDate'])
    )

In [15]:
def load_celebrity_tweets(filename: str = 'celebrity_tweets.csv') -> pd.DataFrame:
    """Load the celebrity tweets CSV from ``DATA_FOLDER``.

    The file is read without a header row; its three columns are named
    ``user``, ``tweet`` and ``sentiment_label``. The label column is then
    discarded and rows containing a missing value are removed.

    Args:
        filename: Name of the CSV file inside ``DATA_FOLDER``.

    Returns:
        DataFrame with the columns ``user`` and ``tweet``.
    """
    source_path = f'{DATA_FOLDER}/{filename}'
    tweets = pd.read_csv(
        source_path,
        header=None,
        names=['user', 'tweet', 'sentiment_label'],
    )
    return tweets.drop(columns=['sentiment_label']).dropna()

In [16]:
def load_cikm_tweets(filename: str = 'cikm_2010_tweets.txt') -> pd.DataFrame:
    """Load the CIKM 2010 tweets from a tab-separated text file in ``DATA_FOLDER``.

    Each line must split into exactly four tab-separated fields, unpacked as
    user id, tweet id, tweet text and creation timestamp. Lines with any other
    field count are skipped. Rows whose timestamp cannot be parsed are dropped.

    Args:
        filename: Name of the text file inside ``DATA_FOLDER``.

    Returns:
        DataFrame with the columns ``Tweet`` (text) and ``CreatedAt``
        (datetime). The index is not reset after invalid rows are dropped.
    """
    # helper to process a single file line
    def process_line(line):
        # strip() also removes leading/trailing tabs, so a line whose first or
        # last field is empty ends up with fewer than four parts and is skipped.
        parts = line.strip().split('\t')
        if len(parts) != 4:
            return None
        # Only the tweet text and its timestamp are kept; the ids are unused.
        _user_id, _tweet_id, tweet, created_at = parts
        return (tweet, created_at)

    # Read inside a context manager so the file handle is always closed
    # (the bare open(...).readlines() left it to the garbage collector).
    with open(f'{DATA_FOLDER}/{filename}') as tweets_file:
        file_lines = tweets_file.readlines()

    # process each line, keeping only the well-formed ones:
    results = []
    for line in tqdm(file_lines):
        result = process_line(line)
        if result is not None:
            results.append(result)

    df = pd.DataFrame(results, columns=['Tweet', 'CreatedAt'])
    # read created at column as datetime, and drop any row with invalid values:
    df['CreatedAt'] = pd.to_datetime(df['CreatedAt'], errors='coerce')
    return df.dropna()

In [17]:
def load_kaggle_tweets(filename: str = 'kaggle_tweets.csv') -> pd.DataFrame:
    """Load the Kaggle tweets CSV from ``DATA_FOLDER`` with a line-based parser.

    The file is decoded as latin-1 and every line is split on the ``","``
    separator. Lines that do not yield exactly six fields are skipped. Rows
    whose ``date`` cannot be parsed are dropped.

    Args:
        filename: Name of the CSV file inside ``DATA_FOLDER``.

    Returns:
        DataFrame with the columns ``sentiment``, ``tweet_id``, ``date``,
        ``query_status``, ``username`` and ``text``. ``date`` holds datetimes;
        all other columns hold the raw strings from the file. The index is not
        reset after invalid rows are dropped.
    """
    column_names = ['sentiment', 'tweet_id', 'date', 'query_status', 'username', 'text']

    # helper to process a single file line
    def process_line(line):
        parts = line.strip().split('","')
        # NOTE(review): this removes every double quote, including any that
        # occur inside the tweet text, not just the ones enclosing each field.
        parts = [part.replace('"', '') for part in parts]
        if len(parts) != len(column_names):
            return None
        return parts

    # Read inside a context manager so the file handle is always closed
    # (the bare open(...).readlines() left it to the garbage collector).
    with open(f'{DATA_FOLDER}/{filename}', encoding='latin-1') as tweets_file:
        file_lines = tweets_file.readlines()

    # process each line, keeping only the well-formed ones:
    results = []
    for line in tqdm(file_lines):
        result = process_line(line)
        if result is not None:
            results.append(result)

    df = pd.DataFrame(results, columns=column_names)
    # read the date column as datetime, and drop any row with invalid values:
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    return df.dropna()

In [18]:
def load_dataset(name: str) -> pd.DataFrame:
    """Load a tweet dataset by its short name.

    Args:
        name: One of ``'sanders'``, ``'celebrity'``, ``'cikm'`` or ``'kaggle'``.

    Returns:
        The DataFrame produced by the matching loader, called with its
        default filename.

    Raises:
        AssertionError: If ``name`` is not one of the known dataset names.
    """
    loaders = dict(
        sanders=load_sanders_tweets,
        celebrity=load_celebrity_tweets,
        cikm=load_cikm_tweets,
        kaggle=load_kaggle_tweets,
    )
    assert name in loaders, f"Dataset {name} not found."
    loader = loaders[name]
    return loader()

In [20]:
# Smoke test: load every dataset once and print its size and first rows.
if __name__ == '__main__':
    datasets = ['sanders', 'celebrity', 'cikm', 'kaggle']
    for dataset in datasets:
        # `df` is rebound on each iteration, so after the loop it holds the
        # last dataset in the list.
        df = load_dataset(dataset)
        print(f"Loaded {len(df)} tweets from {dataset} dataset.")
        print(df.head())

  df['TweetDate'] = pd.to_datetime(df['TweetDate'])


Loaded 5113 tweets from sanders dataset.
   Topic                 TweetDate  \
0  apple 2011-10-18 21:53:25+00:00   
1  apple 2011-10-18 21:09:33+00:00   
2  apple 2011-10-18 21:02:20+00:00   
3  apple 2011-10-18 20:40:10+00:00   
4  apple 2011-10-18 20:34:00+00:00   

                                           TweetText  
0  Now all @Apple has to do is get swype on the i...  
1  @Apple will be adding more carrier support to ...  
2  Hilarious @youtube video - guy does a duet wit...  
3  @RIM you made it too easy for me to switch to ...  
4  I just realized that the reason I got into twi...  
Loaded 3215 tweets from celebrity dataset.
          user                                              tweet
0  BarackObama  Aretha helped define the American experience. ...
1  BarackObama  Bobby Kennedy was one of my heroes. He was som...
2  BarackObama  I’m confident that, together, they’ll strength...
3  BarackObama  Today I’m proud to endorse such a wide and imp...
4  BarackObama  Mandela Day

100%|██████████| 9001672/9001672 [00:09<00:00, 984628.83it/s] 


Loaded 8783191 tweets from cikm dataset.
                                               Tweet           CreatedAt
0  Ok today I have to find something to wear for ... 2010-03-15 17:35:58
1  I am glad I'm having this show but I can't wai... 2010-03-15 16:53:44
2  Honestly I don't even know what's going on any... 2010-03-15 16:52:59
3  @LovelyJ_Janelle hey sorry I'm sitting infront... 2010-03-15 15:42:07
4  Sitting infront of this sewing machine ... I d... 2010-03-15 13:55:22


100%|██████████| 1600000/1600000 [00:07<00:00, 217314.69it/s]
  df['date'] = pd.to_datetime(df['date'], errors='coerce')


Loaded 1600000 tweets from kaggle dataset.
  sentiment    tweet_id                date query_status         username  \
0         0  1467810369 2009-04-06 22:19:45     NO_QUERY  _TheSpecialOne_   
1         0  1467810672 2009-04-06 22:19:49     NO_QUERY    scotthamilton   
2         0  1467810917 2009-04-06 22:19:53     NO_QUERY         mattycus   
3         0  1467811184 2009-04-06 22:19:57     NO_QUERY          ElleCTF   
4         0  1467811193 2009-04-06 22:19:57     NO_QUERY           Karoli   

                                                text  
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1  is upset that he can't update his Facebook by ...  
2  @Kenichan I dived many times for the ball. Man...  
3    my whole body feels itchy and like its on fire   
4  @nationwideclass no, it's not behaving at all....  
