In [1]:
import requests as requests
import snscrape.modules.twitter as sntwitter
import pandas as pd
import re
import torch
import itertools
import wget
import sys
sys.setrecursionlimit(10000)

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from nltk.stem import WordNetLemmatizer

## Twitter Scraping

In [2]:
# scrape twitter data
query = 'cocaine lang:nl until:2022-12-31 since:2013-01-01'
tweets = []
count = 0

# Stream every search hit; the timestamp of each 1000th tweet is printed
# as a lightweight progress marker.
for tweet in sntwitter.TwitterSearchScraper(query).get_items():
    count = (count + 1) % 1000
    if count == 0:
        print(tweet.date)

    # str(tweet.date) is split on whitespace into a date part and a time part,
    # and the year is the first four characters of the date part.
    stamp_parts = str(tweet.date).split()
    day = stamp_parts[0]
    clock = stamp_parts[1]

    row = [
        day[:4],
        day,
        clock,
        tweet.user.username,
        (tweet.content).replace('\n', ' '),  # keep each tweet on a single line
        tweet.replyCount,
        tweet.retweetCount,
        tweet.likeCount,
        tweet.quoteCount,
        tweet.hashtags,
        tweet.sourceLabel,
    ]
    tweets.append(row)

# Select relevant data columns
column_names = ['Year', 'Date', 'Time', 'User', 'Tweet', 'Replies', 'Retweets', 'Likes', 'Quotes',  'tweet.hashtags', 'sourceLabel']
drugs_df = pd.DataFrame(tweets, columns=column_names)

drugs_df.head()

2022-12-10 07:41:59+00:00
2022-11-21 10:01:53+00:00
2022-11-05 22:20:42+00:00
2022-10-17 23:30:15+00:00
2022-10-07 12:44:31+00:00
2022-09-23 09:54:59+00:00
2022-09-10 17:00:07+00:00
2022-09-03 13:21:13+00:00
2022-08-20 07:40:12+00:00
2022-07-26 15:04:48+00:00
2022-07-08 05:38:47+00:00
2022-06-11 18:10:16+00:00
2022-05-16 07:34:49+00:00
2022-04-22 15:20:59+00:00
2022-03-24 13:06:48+00:00
2022-02-23 13:26:44+00:00
2022-02-01 14:37:10+00:00
2022-01-07 16:53:51+00:00
2021-12-14 15:33:37+00:00
2021-11-18 06:20:43+00:00
2021-10-20 16:38:54+00:00
2021-09-25 21:25:09+00:00
2021-08-25 15:57:33+00:00
2021-07-29 16:18:38+00:00
2021-07-07 20:02:41+00:00
2021-06-03 18:30:24+00:00
2021-05-07 18:51:09+00:00
2021-04-01 07:37:56+00:00
2021-03-02 11:08:49+00:00
2021-01-27 22:21:24+00:00
2020-12-29 10:15:55+00:00
2020-11-30 13:48:09+00:00
2020-11-01 14:37:21+00:00
2020-10-06 20:43:32+00:00
2020-09-05 13:44:18+00:00
2020-08-11 09:59:16+00:00
2020-07-15 21:25:37+00:00
2020-06-20 06:03:23+00:00
2020-05-22 1

Unnamed: 0,Year,Date,Time,User,Tweet,Replies,Retweets,Likes,Quotes,tweet.hashtags,sourceLabel
0,2022,2022-12-30,23:29:07+00:00,krippiecool,@PietervVol Had ook cocaïne kunnen zijn . Per...,0,0,0,0,,Twitter for Android
1,2022,2022-12-30,22:27:09+00:00,timgroothuis,@peterkwint Fun fact: de poedersuiker die er o...,0,0,1,0,,Twitter for iPhone
2,2022,2022-12-30,22:01:18+00:00,serranosuner,@Mauritsvdr @_nadianaji Poetin is dan ook geen...,0,0,0,0,,Twitter for iPad
3,2022,2022-12-30,21:31:44+00:00,BramGobbel,@landmarksnl @Woutbucker @VeldKirst13 Die ande...,1,0,0,0,,Twitter for Android
4,2022,2022-12-30,21:27:52+00:00,Erik27793940,Sophie en Tim aan de cocaïne #Top2000agogo,0,0,4,0,[Top2000agogo],Twitter for iPhone


In [5]:
# add week to data
# Parse 'Date' into datetimes (unparseable values become NaT), then derive the
# week-of-year number via strftime('%U') (Sunday-based weeks, values start at 0).
drugs_df['Date'] = pd.to_datetime(drugs_df['Date'], errors='coerce')
week_labels = drugs_df.Date.apply(lambda stamp: stamp.strftime('%U'))
drugs_df['week'] = week_labels.astype(int)

drugs_df.head(2)

Unnamed: 0,Year,Date,Time,User,Tweet,Replies,Retweets,Likes,Quotes,tweet.hashtags,sourceLabel,week
0,2022,2022-12-30,23:29:07+00:00,krippiecool,@PietervVol Had ook cocaïne kunnen zijn . Per...,0,0,0,0,,Twitter for Android,52
1,2022,2022-12-30,22:27:09+00:00,timgroothuis,@peterkwint Fun fact: de poedersuiker die er o...,0,0,1,0,,Twitter for iPhone,52


## Save datasets

In [6]:
# save dataset
# Writes the full tweet table (including the 'week' column if it was added)
# to a CSV file under the relative "datasets/" folder.
drugs_df.to_csv("datasets/tweets_corpus.csv")

## Dataset for dashboard

In [102]:
# absolute values
# Count the non-null 'Tweet' entries for every (Year, week) pair; the result is
# a Series indexed by a (Year, week) MultiIndex.
absolute = drugs_df.groupby(['Year', 'week'])['Tweet'].count()
absolute

Year  week
2013  0       372
      1       747
      2       643
      3       434
      4       559
             ... 
2022  48      358
      49      367
      50      395
      51      372
      52      203
Name: Tweet, Length: 530, dtype: int64

In [103]:
# Persist the per-(Year, week) tweet counts for the dashboard.
absolute.to_csv("datasets/absolute_tweet_values_cocaine.csv")

In [110]:
# apply grouping
# average_per_year: mean number of tweets per week within each year
# (total tweets of the year divided by 52), indexed by Year.
average_per_year = drugs_df.groupby(['Year'])['week'].count() / 52
# tweets_per_week: tweet count per (Year, week) pair.
tweets_per_week = drugs_df.groupby(['Year', 'week'])['Tweet'].count()
# Busiest and quietest week of every year (level 0 of the index is Year).
max_tweets_per_year = tweets_per_week.groupby(level=0).max()
min_tweets_per_year = tweets_per_week.groupby(level=0).min()

# make relative
# Divide each weekly count by that year's weekly average. The previous code
# referenced `average_per_week`, which this notebook never defines, so the
# cell failed on a fresh kernel. `level='Year'` makes the broadcast of the
# per-year values over the (Year, week) index explicit.
relative_tweets_per_week = tweets_per_week.div(average_per_year, level='Year')
max_relative_tweets_per_year = relative_tweets_per_week.groupby(level=0).max()

# normalize
# Scale so that the busiest week of every year equals 100.
relative = relative_tweets_per_week.div(max_relative_tweets_per_year, level='Year') * 100

In [105]:
# Persist the normalised per-(Year, week) values for the dashboard.
relative.to_csv("datasets/relative_tweet_values_cocaine.csv")

In [93]:
# Write the dashboard table to CSV.
# NOTE(review): `dashboard_df` is not assigned in any cell visible in this part
# of the notebook — confirm where it is created, otherwise this line raises
# NameError on Restart & Run All.
dashboard_df.to_csv("datasets/tweets_per_week2022.csv")

## Data Cleaning

In [6]:
# Tokenise free text into lower-case words, keeping the '@' and '#' markers.
def split_words_inclusive(text):
    """Return the lower-cased tokens of ``text``.

    Every character other than ASCII letters, '@' and '#' is replaced by a
    space before splitting, so digits, punctuation and non-ASCII letters act
    as separators (e.g. "cocaïne" yields ["coca", "ne"]).

    Anything that is not exactly a ``str`` produces an empty list.
    """
    if type(text) is not str:
        return []

    # Blank out unwanted characters, normalise the case, then let split()
    # collapse the resulting runs of whitespace.
    letters_only = re.sub("[^a-zA-Z@#]", " ", text)
    return letters_only.lower().split()

# This function lemmatizes the tokens of a text, removes stop words and
# eliminates some words which have no meaning based on manual inspection.
lemmatizer = WordNetLemmatizer()

# Words specifically removed because of manual inspection
_NOISE_TOKENS = frozenset({"amp", "t", "s", "http", "m"})

# Stop-word sets already loaded, keyed by language name.
_STOP_WORD_CACHE = {}


def _load_stop_words(language):
    """Return NLTK's stop-word list for ``language`` as a frozenset, loading it once."""
    if language not in _STOP_WORD_CACHE:
        # Imported here because the notebook's import cell only pulls in nltk.stem.
        from nltk.corpus import stopwords as nltk_stopwords
        _STOP_WORD_CACHE[language] = frozenset(nltk_stopwords.words(language))
    return _STOP_WORD_CACHE[language]


def remove_stopwords(text, language="english"):
    """Tokenise ``text``, lemmatize each token and drop stop words and noise tokens.

    The previous implementation called ``remove_stopwords`` on its own output,
    which recursed until the interpreter's recursion limit was hit. The
    stop-word filtering is now done against NLTK's stop-word corpus
    (run ``nltk.download('stopwords')`` once if it is missing).

    Noise tokens are filtered per token rather than with ``str.replace`` so
    that neighbouring words are not glued together and tokens at the start or
    end of the text are removed as well.

    Parameters
    ----------
    text : str
        Raw text; non-string input yields an empty list (see
        ``split_words_inclusive``).
    language : str, optional
        Name of the NLTK stop-word list to use. Defaults to ``"english"``.

    Returns
    -------
    list of str
        The remaining lemmatized, lower-case tokens in their original order.
    """
    tokens = split_words_inclusive(text)
    if not tokens:
        return []

    stop_words = _load_stop_words(language)
    lemmas = (lemmatizer.lemmatize(token) for token in tokens)
    return [
        lemma
        for lemma in lemmas
        if lemma not in stop_words and lemma not in _NOISE_TOKENS
    ]

# This function reduces the text to lower-case, lemmatized, single-spaced words
# so it is suited for the Roberta model.
def pre_process(text):
    """Return ``text`` as a single string of lemmatized lower-case words.

    Every character other than an ASCII letter is replaced by a space, so
    digits, punctuation, '@', '#' and non-ASCII letters act as separators.
    Anything that is not exactly a ``str`` produces an empty string.
    """
    if type(text) is not str:
        return ""

    # Keep ASCII letters only and normalise the case in one pass.
    cleaned = re.sub("[^a-zA-Z]", " ", text).lower()

    # split() drops the surplus whitespace; each word is lemmatized on the fly.
    lemmas = (lemmatizer.lemmatize(word) for word in cleaned.split())
    return " ".join(lemmas)

# This function determines whether the word cocaine is contained in a tweet.

def filter_drug(text):
    """Return 1 if ``text`` mentions cocaine, otherwise 0.

    The check is case-insensitive and accepts both the spelling "cocaine" and
    the Dutch spelling "cocaïne". The previous condition tested the same
    literal twice and was case-sensitive, so tweets such as "Cocaine ..." or
    "... cocaïne ..." were not flagged.

    Non-string input (e.g. NaN from a DataFrame column) returns 0.
    """
    if not isinstance(text, str):
        return 0

    lowered = text.lower()
    if "cocaine" in lowered or "cocaïne" in lowered:
        return 1
    return 0

In [7]:
# apply function to clean data
# df_cocaine_clean = df_cocaine.copy()
# df_cocaine_clean['Tweet'] = df_cocaine['content'].apply(split_words_inclusive)
# df_cocaine_clean['Tweet no stopwords'] = df_cocaine_clean['Tweet'].apply(remove_stopwords)
# df_cocaine_clean['Clean tweet'] = df_cocaine_clean['content'].apply(pre_process)
