In [1]:
import requests as requests
import snscrape.modules.twitter as sntwitter
import pandas as pd
import re
import torch
import itertools
import wget
import sys
sys.setrecursionlimit(10000)

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from nltk.stem import WordNetLemmatizer

## Twitter Scraping

In [2]:
# Scrape tweets matching the search query and collect one row per tweet.
query = 'xtc lang:nl until:2022-12-31 since:2013-01-01'
tweets = []

scraper = sntwitter.TwitterSearchScraper(query)
for position, tweet in enumerate(scraper.get_items(), start=1):
    # Progress marker: print the timestamp of every 1000th tweet.
    if position % 1000 == 0:
        print(tweet.date)

    # The string form of tweet.date is split on whitespace; the first piece
    # is used as the date, the second as the time, and the first four
    # characters of the date as the year.
    stamp_parts = str(tweet.date).split()
    tweetdate = stamp_parts[0]
    tweettime = stamp_parts[1]
    tweetyear = tweetdate[:4]

    # Line breaks inside the tweet body are flattened to spaces so that each
    # tweet stays on a single line.
    flat_content = (tweet.content).replace('\n', ' ')

    row = [
        tweetyear,
        tweetdate,
        tweettime,
        tweet.user.username,
        flat_content,
        tweet.replyCount,
        tweet.retweetCount,
        tweet.likeCount,
        tweet.quoteCount,
        tweet.hashtags,
        tweet.sourceLabel,
    ]
    tweets.append(row)

# Name the collected columns.
column_names = ['Year', 'Date', 'Time', 'User', 'Tweet', 'Replies', 'Retweets', 'Likes', 'Quotes',  'tweet.hashtags', 'sourceLabel']
drugs_df = pd.DataFrame(tweets, columns=column_names)

drugs_df.head()

2022-09-27 10:54:09+00:00
2022-07-01 00:13:38+00:00
2022-03-20 09:12:33+00:00
2021-12-15 14:45:59+00:00
2021-09-20 18:15:05+00:00
2021-06-28 15:54:32+00:00
2021-02-24 18:01:05+00:00
2020-11-18 16:51:00+00:00
2020-07-13 06:02:22+00:00
2020-03-08 17:37:58+00:00
2020-01-06 15:45:09+00:00
2019-10-09 13:08:33+00:00
2019-08-20 05:07:38+00:00
2019-07-02 05:21:15+00:00
2019-04-08 18:01:52+00:00
2019-02-03 14:12:31+00:00
2018-12-18 16:43:07+00:00
2018-12-17 06:47:04+00:00
2018-10-22 13:39:07+00:00
2018-08-27 08:15:16+00:00
2018-05-11 12:22:05+00:00
2017-12-02 11:50:38+00:00
2017-06-25 14:12:56+00:00
2017-03-03 19:12:41+00:00
2016-10-21 08:26:18+00:00
2016-07-03 23:07:15+00:00
2016-04-21 10:14:00+00:00
2016-01-03 13:07:06+00:00
2015-10-09 20:44:34+00:00
2015-07-21 08:40:42+00:00
2015-05-28 15:52:01+00:00
2015-03-13 15:25:11+00:00
2015-01-23 19:05:55+00:00
2014-12-18 09:49:06+00:00
2014-10-22 11:49:20+00:00
2014-09-05 09:16:20+00:00
2014-06-03 11:14:36+00:00
2014-04-01 17:49:51+00:00
2014-01-19 1

Unnamed: 0,Year,Date,Time,User,Tweet,Replies,Retweets,Likes,Quotes,tweet.hashtags,sourceLabel
0,2022,2022-12-30,22:51:19+00:00,VirtueelGedoe,@Politie Tijd om sommige drugs uit de illegali...,0,0,0,0,,Twitter for Android
1,2022,2022-12-30,21:03:24+00:00,ErikTiel,"drug kopen in coffeeshops is geen probleem, xt...",1,1,2,0,,Twitter Web App
2,2022,2022-12-30,15:35:44+00:00,hesselink_ramon,Weet iemand trouwens dat het woord 'wappie' he...,8,8,44,0,,Twitter Web App
3,2022,2022-12-30,14:40:50+00:00,pzcredactie,"Hennepkewekerij, speed en xtc aangetroffen in ...",0,0,0,0,,Zapier.com
4,2022,2022-12-30,14:08:21+00:00,POL_Zeeland,In een woning aan de Kuipersdijk in #sHeerenho...,1,1,9,1,[sHeerenhoek],OBI4wan


In [3]:
# Parse the 'Date' column into datetimes; values that cannot be parsed are
# coerced to NaT instead of raising.
drugs_df['Date'] = pd.to_datetime(drugs_df['Date'], errors='coerce')

# '%U' renders the week number of the year as a zero-padded string
# (Sunday counted as the first day of the week); convert it to an integer.
week_labels = drugs_df['Date'].dt.strftime('%U')
drugs_df['week'] = week_labels.astype(int)

drugs_df.head(2)

Unnamed: 0,Year,Date,Time,User,Tweet,Replies,Retweets,Likes,Quotes,tweet.hashtags,sourceLabel,week
0,2022,2022-12-30,22:51:19+00:00,VirtueelGedoe,@Politie Tijd om sommige drugs uit de illegali...,0,0,0,0,,Twitter for Android,52
1,2022,2022-12-30,21:03:24+00:00,ErikTiel,"drug kopen in coffeeshops is geen probleem, xt...",1,1,2,0,,Twitter Web App,52


## Save datasets

In [4]:
# Save the tweet corpus held in drugs_df to a CSV file under datasets/.
# NOTE(review): to_csv is called without index=False, so the row index is
# presumably written as an extra unnamed column - confirm that is intended.
drugs_df.to_csv("datasets/tweets_corpus_xtc.csv")

## Dataset for dashboard

In [5]:
# Absolute tweet volume: number of non-null 'Tweet' entries for every
# (Year, week) combination.
tweets_by_year_week = drugs_df.groupby(['Year', 'week'])
absolute = tweets_by_year_week['Tweet'].count()
absolute

Year  week
2013  0       149
      1       287
      2       203
      3       158
      4       163
             ... 
2022  48       84
      49       48
      50       68
      51       55
      52       51
Name: Tweet, Length: 530, dtype: int64

In [6]:
# Write the per-(Year, week) tweet counts stored in `absolute` to CSV for the dashboard.
absolute.to_csv("datasets/absolute_tweet_values_xtc.csv")

In [8]:
# Number of weeks used to turn a yearly tweet total into a weekly average.
# NOTE(review): the '%U' week numbers run from 0 to 52, so a year can have 53
# buckets; the divisor cancels out in `relative` below, so it only affects the
# intermediate `average_per_year` / `relative_tweets_per_week` values.
WEEKS_PER_YEAR = 52

# Per-year weekly average and per-(Year, week) tweet counts.
average_per_year = drugs_df.groupby(['Year'])['week'].count() / WEEKS_PER_YEAR
tweets_per_week = drugs_df.groupby(['Year', 'week'])['Tweet'].count()

# Busiest and quietest week of each year (level 0 of the index is 'Year').
# The dedicated .max()/.min() reductions replace .apply(max)/.apply(min):
# passing Python builtins through GroupBy.apply is deprecated in recent pandas.
max_tweets_per_year = tweets_per_week.groupby(level=0).max()
min_tweets_per_year = tweets_per_week.groupby(level=0).min()

# Make relative: weekly count expressed as a multiple of that year's weekly
# average (the division aligns on the shared 'Year' index level).
relative_tweets_per_week = tweets_per_week / average_per_year
max_relative_tweets_per_year = relative_tweets_per_week.groupby(level=0).max()

# Normalize: scale every year so that its busiest week equals 100.
relative = relative_tweets_per_week / max_relative_tweets_per_year * 100

In [9]:
# Write the normalized per-(Year, week) values stored in `relative` to CSV for the dashboard.
relative.to_csv("datasets/relative_tweet_values_xtc.csv")

## Data Cleaning

In [6]:
# Tokenizer that keeps the '@' and '#' markers of mentions and hashtags.
def split_words_inclusive(text):
    """Split `text` into lower-case tokens.

    Every character that is not an ASCII letter, '@' or '#' is treated as a
    separator, so digits, punctuation and accented letters split words.

    Returns an empty list when `text` is not exactly of type `str`.
    """
    if type(text) is not str:
        return []

    # Replace every disallowed character with a space first, then lower-case;
    # splitting on whitespace afterwards discards the runs of spaces.
    letters_only = re.sub("[^a-zA-Z@#]", " ", text)
    return letters_only.lower().split()

# Tokens removed after manual inspection of the corpus.
_MANUAL_NOISE_TOKENS = frozenset({"amp", "t", "s", "http", "m"})

# Stop-word sets already loaded in this kernel, keyed by language name.
_STOPWORD_CACHE = {}

lemmatizer = WordNetLemmatizer()


def _load_stopwords(language):
    """Return the NLTK stop-word set for `language`, reading the corpus once."""
    if language not in _STOPWORD_CACHE:
        # Imported here so the block is self-contained; NLTK is already a
        # dependency of this notebook (WordNetLemmatizer).
        from nltk.corpus import stopwords
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def remove_stopwords(text, language="english"):
    """Tokenize, lemmatize and strip stop words from a tweet.

    The previous version called itself unconditionally (its own name shadowed
    the stop-word helper it meant to use), so every call ended in a
    RecursionError. Stop words are now filtered against NLTK's stop-word list.

    Parameters
    ----------
    text : str
        Raw tweet text. Anything that is not a `str` yields an empty list,
        because `split_words_inclusive` returns no tokens for it.
    language : str, optional
        Name of the NLTK stop-word list to use (default "english").

    Returns
    -------
    list of str
        Lemmatized tokens with stop words and the manually selected noise
        tokens ("amp", "t", "s", "http", "m") removed.

    Raises
    ------
    LookupError
        If the NLTK "stopwords" corpus has not been downloaded
        (`nltk.download("stopwords")`).
    """
    tokens = split_words_inclusive(text)
    stop_words = _load_stopwords(language)

    kept = []
    for token in tokens:
        lemma = lemmatizer.lemmatize(token)
        # Filtering whole tokens (instead of str.replace on " amp " etc.)
        # avoids gluing neighbouring words together and also catches noise
        # tokens at the very start or end of the tweet.
        if lemma in stop_words or lemma in _MANUAL_NOISE_TOKENS:
            continue
        kept.append(lemma)
    return kept

# Turns raw tweet text into a lemmatized, lower-case string suited for the Roberta model.
def pre_process(text):
    """Return `text` reduced to lower-case, lemmatized words joined by single spaces.

    Every character that is not an ASCII letter is treated as a separator.
    Returns an empty string when `text` is not exactly of type `str`.
    """
    if type(text) is not str:
        return ""

    # Blank out non-letters first, then lower-case the remainder.
    cleaned = re.sub("[^a-zA-Z]", " ", text).lower()

    # Splitting on whitespace drops the padding introduced above; each word is
    # lemmatized with the module-level `lemmatizer`.
    lemmas = []
    for word in cleaned.split():
        lemmas.append(lemmatizer.lemmatize(word))

    return " ".join(lemmas)

# This function determines whether a drug keyword (by default "cocaine") is contained in a tweet.

def filter_drug(text, drug="cocaine"):
    """Flag whether `text` mentions `drug`.

    The match is a case-insensitive substring test, so "Cocaine" and
    "COCAINE" are detected as well. (The previous condition tested the same
    lower-case literal twice and therefore missed capitalised spellings.)

    Parameters
    ----------
    text : str
        Tweet text; any non-string value yields 0.
    drug : str, optional
        Keyword to look for (default "cocaine").

    Returns
    -------
    int
        1 if the keyword occurs in the text, otherwise 0.
    """
    if not isinstance(text, str):
        return 0

    return 1 if drug.lower() in text.lower() else 0

In [7]:
# apply function to clean data
# df_cocaine_clean = df_cocaine.copy()
# df_cocaine_clean['Tweet'] = df_cocaine['content'].apply(split_words_inclusive)
# df_cocaine_clean['Tweet no stopwords'] = df_cocaine_clean['Tweet'].apply(remove_stopwords)
# df_cocaine_clean['Clean tweet'] = df_cocaine_clean['content'].apply(pre_process)
