In [2]:
import requests as requests
import snscrape.modules.twitter as sntwitter
import pandas as pd
import re
import torch
import itertools
import wget
import sys
sys.setrecursionlimit(10000)

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from nltk.stem import WordNetLemmatizer

## Twitter Scraping

In [3]:
# Scrape every tweet matching the search query and collect one row per tweet
query = 'ghb lang:nl until:2022-12-31 since:2013-01-01'
tweets = []
count = 0

scraper = sntwitter.TwitterSearchScraper(query)
for tweet in scraper.get_items():
    # Progress indicator: show the timestamp of every 1000th tweet, then restart the counter
    count = count + 1
    if count == 1000:
        print(tweet.date)
        count = 0

    # str(tweet.date) is split on whitespace: first part is the date, second the time
    stamp_parts = str(tweet.date).split()
    tweetdate, tweettime = stamp_parts[0], stamp_parts[1]
    tweetyear = tweetdate[:4]  # first four characters of the date string

    row = [
        tweetyear,
        tweetdate,
        tweettime,
        tweet.user.username,
        (tweet.content).replace('\n', ' '),  # keep each tweet on a single line
        tweet.replyCount,
        tweet.retweetCount,
        tweet.likeCount,
        tweet.quoteCount,
        tweet.hashtags,
        tweet.sourceLabel,
    ]
    tweets.append(row)

# Column labels, in the same order as the values in each row above
column_names = [
    'Year', 'Date', 'Time', 'User', 'Tweet',
    'Replies', 'Retweets', 'Likes', 'Quotes',
    'tweet.hashtags', 'sourceLabel',
]
drugs_df = pd.DataFrame(tweets, columns=column_names)

drugs_df.head()

2022-05-08 18:38:18+00:00
2021-12-28 10:59:56+00:00
2021-06-16 08:11:32+00:00
2020-11-02 07:46:47+00:00
2020-01-27 17:49:33+00:00
2019-08-07 09:50:41+00:00
2019-06-21 09:23:48+00:00
2018-12-17 21:48:47+00:00
2018-09-05 08:09:38+00:00
2017-10-11 08:15:58+00:00
2017-01-12 15:55:00+00:00
2016-05-24 19:10:08+00:00
2016-01-28 04:45:02+00:00
2015-08-18 16:14:08+00:00
2015-05-03 22:26:31+00:00
2015-03-03 11:49:15+00:00
2014-11-07 09:37:43+00:00
2014-07-24 21:44:47+00:00
2014-04-11 07:43:40+00:00
2013-12-27 20:32:03+00:00
2013-11-14 20:22:27+00:00
2013-09-02 20:59:07+00:00
2013-07-09 09:00:13+00:00
2013-04-09 14:12:18+00:00
2013-02-08 18:00:42+00:00


Unnamed: 0,Year,Date,Time,User,Tweet,Replies,Retweets,Likes,Quotes,tweet.hashtags,sourceLabel
0,2022,2022-12-30,19:01:35+00:00,x_mitsuyozo,@shinlune Dios no puedomcon tanhta emocionbn g...,1,0,0,0,,Twitter Web App
1,2022,2022-12-30,16:22:59+00:00,Pear78tv,@duifmeneerhoe @MrsKrass @BestOf1913 @migratio...,1,0,0,0,,Twitter Web App
2,2022,2022-12-30,15:02:11+00:00,psoixantehuite,@derpmas14881477 @POL_Rotterdam Zo te lezen mo...,1,0,0,0,,Twitter for iPhone
3,2022,2022-12-29,18:54:00+00:00,ShitShowForLife,@19961995sw @de_Kapitein_ Vent kijk waar twitt...,0,0,1,0,,Twitter for Android
4,2022,2022-12-29,14:39:16+00:00,Treech29,@Dylanfeyenoord1 Van alcohol gaan net zo veel ...,1,0,0,0,,Twitter for iPhone


In [22]:
# Add a week-of-year column derived from the tweet date
drugs_df['Date'] = pd.to_datetime(drugs_df['Date'], errors='coerce')

# '%U' is the week number of the year with Sunday as the first day of the week;
# days before the first Sunday fall in week 0
week_labels = drugs_df['Date'].dt.strftime('%U')
drugs_df['week'] = week_labels.astype(int)

drugs_df.head(2)

Unnamed: 0,Year,Date,Time,User,Tweet,Replies,Retweets,Likes,Quotes,tweet.hashtags,sourceLabel,week
0,2022,2022-12-30,19:01:35+00:00,x_mitsuyozo,@shinlune Dios no puedomcon tanhta emocionbn g...,1,0,0,0,,Twitter Web App,52
1,2022,2022-12-30,16:22:59+00:00,Pear78tv,@duifmeneerhoe @MrsKrass @BestOf1913 @migratio...,1,0,0,0,,Twitter Web App,52


## Save datasets

In [5]:
# Write the tweet corpus to disk as CSV (path is relative to the working directory)
drugs_df.to_csv(path_or_buf="datasets/tweets_corpus_ghb.csv")

## Dataset for dashboard

In [6]:
# Absolute number of tweets for every (Year, week) combination
tweets_by_year_week = drugs_df.groupby(['Year', 'week'])
absolute = tweets_by_year_week['Tweet'].count()
absolute

Year  week
2013  0        41
      1        97
      2        89
      3       279
      4       154
             ... 
2022  48       21
      49       32
      50       22
      51       23
      52       20
Name: Tweet, Length: 530, dtype: int64

In [7]:
# Store the absolute weekly tweet counts as CSV
absolute.to_csv(path_or_buf="datasets/absolute_tweet_values_ghb.csv")

In [8]:
# Weekly tweet volume expressed relative to each year's busiest week (0-100 scale).

# apply grouping
# Non-null 'week' entries per year divided by 52: an approximate average number of tweets per week
average_per_year = drugs_df.groupby(['Year'])['week'].count() / 52
# Number of tweets in every (Year, week) combination
tweets_per_week = drugs_df.groupby(['Year', 'week'])['Tweet'].count()
# Highest and lowest weekly count within each year (index level 0 is Year).
# The groupby reductions are called directly instead of passing the Python builtins
# max/min to .apply(), which pandas deprecates.
max_tweets_per_year = tweets_per_week.groupby(level=0).max()
min_tweets_per_year = tweets_per_week.groupby(level=0).min()

# make relative: weekly count divided by that year's average weekly count
relative_tweets_per_week = tweets_per_week / average_per_year
max_relative_tweets_per_year = relative_tweets_per_week.groupby(level=0).max()

# normalize: scale so that the busiest week of every year equals 100
# NOTE(review): the per-year average cancels out in this ratio, so the result equals
# tweets_per_week / max_tweets_per_year * 100; the divisor 52 does not affect `relative`.
relative = relative_tweets_per_week / max_relative_tweets_per_year * 100

In [9]:
# Store the normalized (0-100) weekly tweet values as CSV
relative.to_csv(path_or_buf="datasets/relative_tweet_values_ghb.csv")

## Data Cleaning

In [6]:
# Tokenizer that keeps the '@' and '#' markers of mentions and hashtags
def split_words_inclusive(text):
    """Split ``text`` into lower-case tokens.

    Every character that is not an ASCII letter, '@' or '#' is treated as a
    separator. Anything that is not exactly a ``str`` yields an empty list.
    """
    if type(text) is not str:
        return []

    # Replace all other characters by spaces so they act as token boundaries
    letters_and_markers = re.sub("[^a-zA-Z@#]", " ", text)

    # Lower-case, then split on whitespace (also discards empty tokens)
    return letters_and_markers.lower().split()

# This function tokenizes and lemmatizes a tweet, then removes English stopwords
# (NLTK stopword corpus) and some words which have no meaning based on manual inspection
lemmatizer = WordNetLemmatizer()

# Words specifically removed because of manual inspection
_MANUAL_STOPWORDS = frozenset({"amp", "t", "s", "http", "m"})

# Filled on first use so the stopword corpus is read from disk only once
_stopword_cache = None


def _get_stopwords():
    """Return the set of tokens to discard: NLTK English stopwords plus the manual list."""
    global _stopword_cache
    if _stopword_cache is None:
        # Imported lazily; needs the NLTK 'stopwords' corpus (nltk.download('stopwords'))
        from nltk.corpus import stopwords
        _stopword_cache = frozenset(stopwords.words("english")) | _MANUAL_STOPWORDS
    return _stopword_cache


def remove_stopwords(text):
    """Return the lemmatized tokens of ``text`` with stopwords removed.

    Parameters
    ----------
    text : str
        Raw tweet text. It is tokenized by ``split_words_inclusive``, which
        returns an empty list for anything that is not a ``str``.

    Returns
    -------
    list of str
        Lower-case, lemmatized tokens that are neither English stopwords nor
        in the manually curated removal list.

    Notes
    -----
    The earlier version called ``remove_stopwords`` from inside itself, which
    recursed without end; the stopword filtering is now done here directly.
    Unwanted words are dropped per token rather than with ``str.replace`` on the
    joined text, which merged neighbouring words and missed words at the start
    or end of the tweet.
    """
    stop_tokens = _get_stopwords()
    kept = []
    for token in split_words_inclusive(text):
        lemma = lemmatizer.lemmatize(token)
        # Check both forms: lemmatization may change a stopword's surface form
        if token in stop_tokens or lemma in stop_tokens:
            continue
        kept.append(lemma)
    return kept

# This function cleans, tokenizes and lemmatizes the text so it is suited for the Roberta model
def pre_process(text):
    """Return ``text`` as a single string of lower-case, lemmatized words.

    Every character that is not an ASCII letter is treated as a separator.
    Anything that is not exactly a ``str`` yields an empty string.
    """
    if type(text) is not str:
        return ""

    # Keep ASCII letters only, lower-case the result
    letters_only = re.sub("[^a-zA-Z]", " ", text).lower()

    # Splitting on whitespace removes the padding introduced above
    lemmas = []
    for word in letters_only.split():
        lemmas.append(lemmatizer.lemmatize(word))

    return " ".join(lemmas)

# This function determines whether a drug name (by default "cocaine") is contained in a tweet.

def filter_drug(text, drug="cocaine"):
    """Flag whether ``drug`` is mentioned in ``text``.

    Parameters
    ----------
    text : str
        Tweet text. Any non-string value (e.g. None or NaN) is treated as
        "no mention".
    drug : str, optional
        Substring to look for; defaults to "cocaine".

    Returns
    -------
    int
        1 if ``drug`` occurs in ``text`` ignoring upper/lower case, else 0.
    """
    if not isinstance(text, str):
        return 0

    # Compare case-insensitively so "Cocaine" and "COCAINE" are found as well
    if drug.lower() in text.lower():
        return 1
    return 0

In [7]:
# apply function to clean data
# df_cocaine_clean = df_cocaine.copy()
# df_cocaine_clean['Tweet'] = df_cocaine['content'].apply(split_words_inclusive)
# df_cocaine_clean['Tweet no stopwords'] = df_cocaine_clean['Tweet'].apply(remove_stopwords)
# df_cocaine_clean['Clean tweet'] = df_cocaine_clean['content'].apply(pre_process)
