In [1]:
import os
import re
import pandas as pd
# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole session; assignments onto filtered slices will no longer warn.
pd.options.mode.chained_assignment = None 

In [2]:
def merge_data(path):
    """Merge every CSV found in the sub-folders of ``path`` into one frame.

    Each immediate sub-folder of ``path`` is scanned and every file in it is
    read with ``pd.read_csv`` (first row as header). The rows are stacked,
    exact duplicate rows are dropped, and only rows whose ``language`` column
    equals ``'en'`` are kept.

    Parameters
    ----------
    path : str
        Root data directory. A trailing path separator is optional.

    Returns
    -------
    pandas.DataFrame
        The merged, de-duplicated, English-only rows. The index is not reset
        after filtering.

    Raises
    ------
    ValueError
        If no non-empty CSV file is found under ``path``.
    """
    frames = []
    # Sorted so the row order -- and therefore which copy of a duplicate is
    # kept "first" downstream -- does not depend on the OS listing order.
    for folder in sorted(os.listdir(path)):
        folder_path = os.path.join(path, folder)
        if not os.path.isdir(folder_path):
            # Stray files sitting next to the data folders are not data.
            continue
        for file in sorted(os.listdir(folder_path)):
            try:
                frames.append(
                    pd.read_csv(os.path.join(folder_path, file),
                                index_col=None, header=0)
                )
            except pd.errors.EmptyDataError:
                # Zero-byte files carry no rows; skip them.
                continue

    if not frames:
        # pd.concat([]) would fail with an unhelpful message; be explicit.
        raise ValueError('No non-empty CSV files found under %r' % (path,))

    # Concatenate, de-duplicate and filter once, after everything is loaded.
    df = pd.concat(frames, axis=0, ignore_index=True)
    df = df.drop_duplicates()
    return df[df['language'] == 'en']

def assign_tidyTweet(s):
    """Return ``s`` as text with URLs and @mentions removed and ``&amp;`` spelled out.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Substitutions are applied in this order:

    1. ``http://`` / ``https://`` links are deleted,
    2. ``@username`` mentions are deleted,
    3. the HTML entity ``&amp;`` is replaced by the word ``and``.

    Whitespace around removed tokens is left untouched.
    """
    substitutions = (
        (r'https?://\S+', r''),
        (r'@\w+', r''),
        (r'&amp;', r'and'),
    )
    cleaned = str(s)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def drop_hashtag(s):
    """Return the tidied tweet text with every ``#hashtag`` token removed.

    The value is first passed through ``assign_tidyTweet``; then each ``#``
    followed by word characters is deleted. Surrounding whitespace is kept.
    """
    return re.sub(r'#\w+', r'', assign_tidyTweet(s))

def process_and_save(path, name):
    """Merge the raw tweets under ``path``, de-duplicate them and write a CSV.

    Steps:

    1. Load the merged frame via ``merge_data(path)``.
    2. Build a ``no_hashtag`` column with ``drop_hashtag`` and use it to drop
       rows whose stripped text is the empty string and rows whose stripped
       text repeats an earlier row (first occurrence wins).
    3. Keep the ``id``, ``date``, ``search`` and ``tweet`` columns and add a
       ``tidyTweet`` column produced by ``assign_tidyTweet``.
    4. Write the result to ``name + '.csv'`` without the index.

    Parameters
    ----------
    path : str
        Root data directory handed to ``merge_data``.
    name : str
        Output file name without the ``.csv`` extension.

    Returns
    -------
    The return value of ``DataFrame.to_csv`` for the written file.
    """
    df = merge_data(path)
    # assign() builds a new frame, so the filtered slice returned by
    # merge_data is never written to in place.
    df = df.assign(no_hashtag=df['tweet'].apply(drop_hashtag))
    df = df[df['no_hashtag'].str.len() != 0]
    df = df.drop_duplicates(subset=['no_hashtag'], keep='first')

    # The comma between 'search' and 'tweet' is required: adjacent string
    # literals concatenate into the single, non-existent column 'searchtweet'.
    # .copy() makes the selection independent before a column is added to it.
    df_tweets = df[['id', 'date', 'search', 'tweet']].copy()
    df_tweets['tidyTweet'] = df_tweets['tweet'].apply(assign_tidyTweet)
    return df_tweets.to_csv(name + '.csv', index=False)

In [3]:
# Build one cleaned CSV per raw dataset directory.
# NOTE(review): the first output name has no 'test_' prefix while the other
# two do -- confirm this naming is intended.
process_and_save(path='./conspiracy_theories_data/', name='conspiracy_theories_data')
process_and_save(path='./covid_data/', name='test_covid_data')
process_and_save(path='./truckers_data/', name='test_truckers_data')