In [30]:
# imports

import os
import pandas as pd
import pyarrow as pyarrow
import regex as re
import langid
from pyarrow import feather
from datetime import datetime
from LanguageDetection import LanguageDetection as lang
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer

---
## Load Data

In [31]:
# Read both tweet CSVs and discard the columns this notebook does not use.

# Sentiment dataset: drop the label column and rename the text/id columns so
# both frames expose the same 'tweet' and 'tweetsid' column names.
df_twitter_old = (
    pd.read_csv('twitter_res/twitter_sentiment_data.csv')
    .drop(columns=['sentiment'])
    .rename(columns={'message': 'tweet', 'tweetid': 'tweetsid'})
)

# Climate-change dataset: the 'hashtag' column is dropped here; a 'hashtags'
# column is rebuilt from the tweet text further down.
df_twitter_new = (
    pd.read_csv('twitter_res/climate_change_tweets.csv')
    .drop(columns=['hashtag'])
)

In [32]:
# concat
# NOTE: the concatenation below is disabled. The following cells clean and
# export df_twitter_old and df_twitter_new as two separate frames.

#df_twitter = pd.concat([df_twitter_old, df_twitter_new])
#df_twitter = df_twitter.reset_index()
#df_twitter = df_twitter.head(500)

#df_twitter.info()

---
## Clean Data

**General cleaning (remove stopwords and transform to lowercase)**

In [33]:
def general_cleaning(tweet):
    """Lowercase a tweet and strip non-ASCII characters and stopwords.

    Relies on the module-level tokenizer `tk` and the `stop_words` set.

    Args:
        tweet: Raw tweet text (str).

    Returns:
        The kept tokens in lowercase, each followed by a single space
        (so a non-empty result ends with a trailing space).
    """
    # Characters that cannot be encoded as ASCII are silently discarded.
    ascii_only = tweet.encode('ascii', errors='ignore').decode()
    lowered = (word.lower() for word in tk.tokenize(ascii_only))
    return ''.join(word + ' ' for word in lowered if word not in stop_words)

# NLTK's English stopwords plus a few extra tokens that are filtered as well.
stop_words = set(stopwords.words('english')) | {'rt', 'htt', 'https:'}

# Tokens are split on whitespace only, so punctuation stays attached to words.
tk = WhitespaceTokenizer()

df_twitter_old["tweet"] = df_twitter_old["tweet"].apply(general_cleaning)
df_twitter_new["tweet"] = df_twitter_new["tweet"].apply(general_cleaning)

**Remove URLs and users marked with @**

In [34]:
def remove_url(tweet):
    """Return `tweet` with every `http://`, `https://` or `www.` link removed.

    A link runs up to the next whitespace character; the whitespace around it
    is left untouched.
    """
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, "", tweet)


def remove_user(tweet):
    """Return `tweet` with every `@...` mention removed.

    A mention is an `@` followed by all non-whitespace characters after it,
    so trailing punctuation attached to the handle is removed too.
    """
    mention_pattern = r'@[^\s]+'
    return re.sub(mention_pattern, "", tweet)

# Strip links first, then @-mentions, from the tweet text of both frames.
df_twitter_old["tweet"] = (
    df_twitter_old["tweet"].apply(remove_url).apply(remove_user)
)

df_twitter_new["tweet"] = (
    df_twitter_new["tweet"].apply(remove_url).apply(remove_user)
)

**Drop Duplicates**

In [35]:
# Keep only the first row for each distinct cleaned tweet text.
df_twitter_old = df_twitter_old.drop_duplicates(subset='tweet')
df_twitter_new = df_twitter_new.drop_duplicates(subset='tweet')

**Drop rows where the tweet is not in English**

In [36]:
def detect_language(tweet):
    """Report whether `langid` classifies `tweet` as English.

    Args:
        tweet: Text to classify.

    Returns:
        True when the first element of `langid.classify(tweet)` is "en",
        otherwise False.
    """
    # langid.classify returns a tuple whose first item is the language code.
    language_code = langid.classify(tweet)[0]
    return language_code == "en"
# Filtering language
# Classify every tweet once and keep only the rows detected as English.
# A boolean mask replaces the earlier per-row lookup
# `df[df["tweet"] == tweet].index[0]`, which rescanned the whole frame for
# every non-English tweet (quadratic in the number of rows) and relied on a
# list.append side effect inside `apply`.
is_english_old = df_twitter_old['tweet'].apply(detect_language).astype(bool)
# Index labels of the rejected rows; kept under the original name.
to_drop_old = df_twitter_old.index[~is_english_old].tolist()
df_twitter_old = df_twitter_old.drop(to_drop_old)

is_english_new = df_twitter_new['tweet'].apply(detect_language).astype(bool)
to_drop_new = df_twitter_new.index[~is_english_new].tolist()
df_twitter_new = df_twitter_new.drop(to_drop_new)

---
## Other

**Add a column with the date decoded from each tweet's ID**
Source: https://github.com/oduwsdl/tweetedat


In [37]:
def get_timestamp(id):
    """Decode the creation date embedded in a tweet ID.

    The bits above the lowest 22 are read as a millisecond count, to which the
    constant 1288834974657 is added to obtain Unix-epoch milliseconds (method
    taken from the tweetedat project linked above this cell).

    Args:
        id: Integer tweet ID. NOTE(review): IDs that do not follow this
            layout would decode to a meaningless date - confirm the datasets
            only contain IDs of this form.

    Returns:
        The UTC date formatted as "DD-MM-YYYY".
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timezone

    epoch_millis = (id >> 22) + 1288834974657
    # Timezone-aware replacement for datetime.utcfromtimestamp, which is
    # deprecated since Python 3.12; the formatted date is the same.
    time = datetime.fromtimestamp(epoch_millis / 1000, tz=timezone.utc)
    return time.strftime("%d-%m-%Y")

# Derive the 'tweet_ts' date column from the tweet id, then drop the id
# column from both frames.
df_twitter_old = (
    df_twitter_old
    .assign(tweet_ts=df_twitter_old['tweetsid'].apply(get_timestamp))
    .drop(columns="tweetsid")
)

df_twitter_new = (
    df_twitter_new
    .assign(tweet_ts=df_twitter_new['tweetsid'].apply(get_timestamp))
    .drop(columns="tweetsid")
)


**Add column with hashtags**

In [38]:
def get_hashtags(tweet):
    """Return all hashtags found in `tweet`, in order of appearance.

    A hashtag is a `#` followed by one or more word characters; anything after
    the first non-word character (e.g. an apostrophe) is not part of the tag.

    Args:
        tweet: Tweet text to scan.

    Returns:
        A list of hashtag strings including the leading `#`; empty if none.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.findall(r'#\w+', tweet)

# Store the list of hashtags found in each cleaned tweet in a new column.
df_twitter_old['hashtags'] = df_twitter_old['tweet'].apply(get_hashtags)
df_twitter_new['hashtags'] = df_twitter_new['tweet'].apply(get_hashtags)

In [39]:
# Display the processed sentiment-dataset frame (tweet, tweet_ts, hashtags).
df_twitter_old

Unnamed: 0,tweet,tweet_ts,hashtags
0,climate change interesting hustle global warm...,31-10-2016,[]
1,"watch #beforetheflood right here, travels wo...",31-10-2016,[#beforetheflood]
2,fabulous! leonardo #dicaprio's film #climate c...,31-10-2016,"[#dicaprio, #climate]"
3,watched amazing documentary leonardodicaprio ...,31-10-2016,[]
4,"pranita biswasi, lutheran odisha, gives testi...",31-10-2016,[]
...,...,...,...
43934,#awareness walls aren$q$t answer people fleein...,26-10-2016,[#awareness]
43935,americans scared clowns climate change.,26-10-2016,[]
43939,respective parties prevent climate change glob...,26-10-2016,[#zpndebate]
43941,still can$q$t believe gif taehyung saved huma...,26-10-2016,[]


In [40]:
# Display the processed climate-change-dataset frame (tweet, tweet_ts, hashtags).
df_twitter_new

Unnamed: 0,tweet,tweet_ts,hashtags
0,lawmakers justify protecting old growth forest...,11-11-2022,"[#scicomm, #carbon, #climatechange]"
1,evolution plant drought strategies herbivore t...,28-04-2022,[#climatechange]
2,#conservation #climatecrisis #climateaction #r...,14-03-2022,"[#conservation, #climatecrisis, #climateaction..."
3,"winner year's photography4humanity prize, ""a t...",04-01-2022,[#climateaction]
4,"blame shifting, peak passivity, lies #climatec...",10-03-2022,"[#climatecrisis, #scottydoesnothing]"
...,...,...,...
2363325,world temperature anomalies 17.03.2023 76 /76 ...,17-03-2023,"[#arctic, #africa, #russia, #climatecrisis, #c..."
2363326,louisiana 8 worst water-polluting refineries c...,31-01-2023,"[#climatechange, #oceans, #oceanpullution, #po..."
2363328,"free webinar: electric vehicles 101, january 3...",26-01-2023,"[#evs, #electricvehicles, #vehicles, #cars, #p..."
2363329,#worldozoneday #saveearth #gogreen #savesoil ...,16-09-2022,"[#worldozoneday, #saveearth, #gogreen, #saveso..."


**Write twitter dataframe to binary Feather format**

In [46]:
def write_to_feather(df, name):
    """Write `df` as a Feather file into the `twitter_res/` folder.

    The target folder is located next to the path that
    'twitter_preprocessing.ipynb' resolves to, i.e. relative to the current
    working directory.

    Args:
        df: DataFrame to serialise.
        name: File name to create inside `twitter_res/`.
    """
    notebook_path = os.path.abspath('twitter_preprocessing.ipynb')
    target = os.path.join(os.path.dirname(notebook_path), 'twitter_res/' + name)
    feather.write_feather(df, target)

# read: df = pd.read_feather(path);

In [47]:
# Persist each processed frame as its own Feather file under twitter_res/.
write_to_feather(df=df_twitter_old, name='twitter_old.ftr')
write_to_feather(df=df_twitter_new, name='twitter_new.ftr')