In [1]:
# imports

import os
from datetime import datetime, timezone

import langid
import pandas as pd
import pyarrow as pyarrow
import pyarrow.feather
import regex as re
from LanguageDetection import LanguageDetection as lang
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer

---
## Load Data

In [2]:
# load data and drop obsolete columns

# Older sentiment dataset: the 'sentiment' column is dropped and the text/ID
# columns are renamed to 'tweet' / 'tweetsid' (presumably to match the newer
# file's column names).
df_twitter_old = (
    pd.read_csv('twitter_res/twitter_sentiment_data.csv')
    .drop(columns=['sentiment'])
    .rename(columns={'message': 'tweet', 'tweetid': 'tweetsid'})
)

# Newer climate-change dataset: the 'hashtag' column is dropped.
df_twitter_new = (
    pd.read_csv('twitter_res/climate_change_tweets.csv')
    .drop(columns=['hashtag'])
)

In [3]:
# concat
#
# NOTE: reset_index() is called without drop=True, so the pre-concat row
# labels are kept as an extra 'index' column in the combined frame.
df_twitter = (
    pd.concat([df_twitter_old, df_twitter_new])
    .reset_index()
    .head(500)  # only the first 500 rows are processed further
)

---
## Clean Data

**General cleaning (strip non-ASCII characters, remove stopwords, and transform to lowercase)**

In [4]:
def general_cleaning(tweet):
    """Strip non-ASCII characters, lowercase the tweet and remove stopwords.

    The text is split with the notebook-level tokenizer ``tk``. Every token is
    lowercased and kept unless it appears in the notebook-level ``stop_words``
    set.

    Returns the kept tokens, each followed by a single space (so a non-empty
    result ends with a trailing space; the result is ``""`` when no token is
    kept).
    """
    # Characters that cannot be encoded as ASCII are silently discarded.
    ascii_only = tweet.encode('ascii', errors='ignore').decode()
    lowered = (word.lower() for word in tk.tokenize(ascii_only))
    return "".join(word + ' ' for word in lowered if word not in stop_words)

# NLTK's English stopwords plus the extra tokens 'rt' and 'htt'.
stop_words = set(stopwords.words('english')) | {'rt', 'htt'}

tk = WhitespaceTokenizer()

# Clean every tweet in place in the 'tweet' column.
df_twitter["tweet"] = df_twitter["tweet"].apply(general_cleaning)

**Remove URLs and users marked with @**

In [5]:
def remove_url(tweet):
    """Return ``tweet`` with every ``http://``, ``https://`` or ``www.`` link deleted.

    A link extends up to (not including) the next whitespace character; the
    surrounding whitespace is left untouched.
    """
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, "", tweet)


def remove_user(tweet):
    """Return ``tweet`` with every @-mention deleted.

    A mention is an ``@`` followed by one or more non-whitespace characters;
    a lone ``@`` is kept.
    """
    mention_pattern = r'@[^\s]+'
    return re.sub(mention_pattern, "", tweet)

# Remove @ and urls (URLs first, then @-mentions)
df_twitter["tweet"] = df_twitter["tweet"].apply(remove_url).apply(remove_user)

**Drop Duplicates**

In [6]:
# Rows whose cleaned tweet text repeats an earlier row are removed.
df_twitter = df_twitter.drop_duplicates(subset='tweet')

**Drop rows where the tweet is not in English**

In [7]:
def detect_language(tweet):
    """Return True when langid classifies ``tweet`` as English, else False.

    Only the first element of the ``langid.classify`` result is compared
    against ``"en"``; the remaining elements are ignored.
    """
    language_code = langid.classify(tweet)[0]
    return language_code == "en"
# Filtering language
#
# Classify every tweet once and keep only the rows flagged as English.
# A boolean mask avoids looking rows up again by tweet text (a full-frame
# scan per rejected row that is only correct while tweets are unique) and
# avoids mutating a list from inside ``apply``.
# ``astype(bool)`` keeps the mask boolean even when the frame is empty.
is_english = df_twitter['tweet'].apply(detect_language).astype(bool)
# ``.copy()`` yields an independent frame, so later column assignments do not
# operate on a slice of the unfiltered data.
df_twitter = df_twitter.loc[is_english].copy()

---
## Other

**Add column with the date decoded from the tweet ID**

Source: https://github.com/oduwsdl/tweetedat


In [8]:
def get_timestamp(id):
    """Decode the UTC creation date embedded in a tweet ID.

    The ID is shifted right by 22 bits and the offset 1288834974657 (in
    milliseconds) is added, giving a Unix timestamp in milliseconds.

    Parameters
    ----------
    id : int
        Tweet ID. Must support ``>>`` (i.e. be an integer type).

    Returns
    -------
    str
        The date in UTC, formatted as ``DD-MM-YYYY``.

    NOTE(review): presumably this is only meaningful for IDs that actually
    embed a timestamp; older, sequentially assigned IDs would all decode to
    the offset date -- confirm against the data.
    """
    twitter_epoch_ms = 1288834974657
    created_ms = (id >> 22) + twitter_epoch_ms
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; a
    # timezone-aware conversion formats to the same UTC date.
    created = datetime.fromtimestamp(created_ms / 1000, tz=timezone.utc)
    return created.strftime("%d-%m-%Y")

# Derive the date column from the tweet IDs, then discard the raw ID column.
df_twitter['tweet_ts'] = df_twitter['tweetsid'].apply(get_timestamp)
df_twitter = df_twitter.drop(columns=["tweetsid"])


In [9]:
# Plain-text dump of the current frame (pandas truncates long frames itself).
print(df_twitter)

     index                                              tweet    tweet_ts
0        0   climate change interesting hustle global warm...  31-10-2016
1        1   watch #beforetheflood right here,  travels wo...  31-10-2016
2        2  fabulous! leonardo #dicaprio's film #climate c...  31-10-2016
3        3   watched amazing documentary leonardodicaprio ...  31-10-2016
4        4   pranita biswasi, lutheran odisha, gives testi...  31-10-2016
..     ...                                                ...         ...
494    494  cartoon: media covered climate change way cove...  01-11-2016
496    496   effect climate change food chain huge  #anima...  01-11-2016
497    497   youre enjoying warm weather lowkey know globa...  01-11-2016
498    498   appetite oil gas continue grow despite effort...  01-11-2016
499    499   via  cartoon: media covered climate change wa...  01-11-2016

[281 rows x 3 columns]


**Add column with hashtags**

In [10]:
def get_hashtags(tweet):
    """Return all hashtags found in ``tweet``.

    A hashtag is a ``#`` followed by one or more word characters. The
    matches are returned in order of appearance, each including its leading
    ``#``; the list is empty when there is no hashtag.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.findall(r'#\w+', tweet)

# One list of hashtags per tweet.
df_twitter['hashtags'] = df_twitter['tweet'].apply(get_hashtags)

**Write twitter dataframe to binary Feather format**

In [None]:
def write_to_feather(df=None, path=None):
    """Write a dataframe to a Feather file and return the file path.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to write. Defaults to the notebook-level ``df_twitter``.
    path : str, optional
        Destination file. Defaults to ``twitter_res/twitter.ftr`` below the
        directory that ``'twitter_preprocessing.ipynb'`` resolves to, i.e. the
        current working directory.

    Returns
    -------
    str
        The path the frame was written to, so it can be passed straight to
        ``pd.read_feather``.
    """
    if df is None:
        df = df_twitter
    if path is None:
        # abspath() resolves the relative notebook name against the current
        # working directory; dirname() then yields that directory.
        root_dir = os.path.dirname(os.path.abspath('twitter_preprocessing.ipynb'))
        path = os.path.join(root_dir, 'twitter_res/twitter.ftr')
    # Needs the explicit ``import pyarrow.feather`` from the import cell: the
    # submodule is not guaranteed to be loaded by ``import pyarrow`` alone.
    pyarrow.feather.write_feather(df, path)
    return path

# To read the file back: df_twitter = pd.read_feather('twitter_res/twitter.ftr')