# Pipeline workflow

## Requirements installation

In [4]:
# Optional one-time setup: uncomment the lines below to install the project
# requirements and python-dotenv. Inside a notebook, `%pip install ...` is
# preferable to `! pip install ...` because it targets the running kernel's
# environment.
# ! pip install -r ../requirements.txt
# ! pip install python-dotenv

## Load environment variables and set GSICrawler parameters

In [7]:
# Load Environment Variables
# load_dotenv() looks for a .env file and adds the key/value pairs it finds
# to the process environment (os.environ).
# NOTE(review): none of the cells visible in this notebook read os.environ
# afterwards -- the service hosts are hard-coded in the parameters that
# follow; confirm whether the imported tasks rely on these variables.
from dotenv import load_dotenv
load_dotenv()

# --- Crawl definition -------------------------------------------------------
# `source` and `gsicrawler_params` are handed to retrieveDataFromGSICrawler.
source = "twitter"
gsicrawler_params = dict(query="AfD", library="snscrape")

# Date strings passed to the crawler as `before` / `after`
# (presumably the bounds of the crawl window -- confirm against the task).
after = "2022-02-15"
before = "2022-02-16"

# --- Analysis settings ------------------------------------------------------
# Algorithm names passed to sentimentAnalysis.
algorithms = ["liwc", "mft"]

# Stage switches: `ideology` gates the ideology annotation step.
ideology = True
geo = True

# --- Service endpoints ------------------------------------------------------
gsicrawler_host = "http://localhost:5000/api/v1"  # or os.environ['GSICRAWLER_URL']
senpy_host = "http://localhost:8007/api"

# NOTE(review): `index` is not referenced by any cell visible here.
index = "test"

## Import libraries

In [8]:
from tasks.scrapy import retrieveDataFromGSICrawler
from tasks.ideology import annotateIdeology
from tasks.language import detectLanguage
from tasks.preprocess import preprocessText
from tasks.geo import annotateGeolocation
from tasks.sentiment import sentimentAnalysis
from tasks.fuseki import removeNonSemanticFields, sendDataToFuseki

import requests
import time
import json
import re

from gsitk.preprocess import pprocess_twitter, normalize

import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/chatops/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Get data from GSICrawler

This is the same request used in the Scrapy Task.

In [9]:
# Ask GSICrawler for the posts matching the configured source, query
# parameters and date strings.
tweet_list = retrieveDataFromGSICrawler(
    gsicrawler_host,
    source,
    gsicrawler_params,
    before,
    after,
)

# Pretty-print the full response as JSON (3-space indent).
print(json.dumps(tweet_list, indent=3))

[
   {
      "@id": 1493736632259809282,
      "@type": [
         "schema:BlogPosting"
      ],
      "schema:about": "AfD",
      "schema:articleBody": "JAX issues Area Forecast Discussion (AFD) at Feb 15, 6:58 PM EST https://t.co/MfLAMFqxts",
      "schema:author": "twitter",
      "schema:creator": "iembot_jax",
      "schema:datePublished": "2022-02-15T23:59:00Z",
      "schema:headline": "JAX issues Area Forecast Discussion (AFD) at Feb 15, 6:58 PM EST https://t.co/MfLAMFqxts",
      "schema:inLanguage": "en",
      "schema:keywords": null,
      "schema:search": "AfD",
      "year": "2022"
   },
   {
      "@id": 1493736585262612481,
      "@type": [
         "schema:BlogPosting"
      ],
      "schema:about": "AfD",
      "schema:articleBody": "ICT issues Area Forecast Discussion (AFD) at Feb 15, 5:58 PM CST https://t.co/Rb65dwxaqJ",
      "schema:author": "twitter",
      "schema:creator": "iembot_ict",
      "schema:datePublished": "2022-02-15T23:58:48Z",
      "schema:headli

## Preprocessing

- Ideology annotation task: annotates ideology and narrative.
- Language detection task: if the language has not been annotated by GSICrawler, it is detected with NLTK.
- Preprocessing task: cleans the text (URLs, mentions, hashtags, and punctuation symbols) and tokenizes words. In the output below, URLs, mentions, and hashtags appear as placeholder tokens such as `<url>`, `<user>`, and `<hashtag>`.
- Geo annotation task: annotates the location using the Google Geocoding API.

In [10]:
# Run every crawled tweet through the preprocessing stages in order:
# ideology annotation -> language detection -> text preprocessing -> geolocation.
# A tweet is dropped as soon as the ideology or language stage returns None.
preprocessing_results = []

for tweet in tweet_list:
    # Annotate ideology and narrative only when the stage is enabled; otherwise
    # pass the tweet through untouched. Assigning on both paths matters: before,
    # `annotated_tweet` was unbound (NameError) on the first iteration when
    # `ideology` was False, or silently reused the previous tweet's value.
    annotated_tweet = annotateIdeology(tweet) if ideology else tweet
    if annotated_tweet is None:
        continue

    # Detect language
    language_detected_tweet = detectLanguage(annotated_tweet)
    if language_detected_tweet is None:
        continue

    # Text preprocessing
    preproccesed_tweet = preprocessText(language_detected_tweet, source)

    # Annotate geolocalization, honouring the `geo` switch from the
    # parameters cell (it was previously defined but never consulted).
    if geo:
        annotated_geo_tweet = annotateGeolocation(preproccesed_tweet)
    else:
        annotated_geo_tweet = preproccesed_tweet

    preprocessing_results.append(annotated_geo_tweet)

print(json.dumps(preprocessing_results, indent=3))

[
   {
      "@id": 1493735358806245376,
      "@type": [
         "schema:BlogPosting"
      ],
      "schema:about": "AfD",
      "schema:articleBody": "ich habe gerade bei <user> einem mega header gesehen \u2026\n\nich lache mehr als ich sollte \ud83d\ude02\ud83d\ude02\n\n<hashtag> fckafd <allcaps> <allcaps> <hashtag> fcknzs <allcaps> <allcaps> <hashtag> noafd <hashtag> keinsexmitnazis <hashtag> nazissinddoof <hashtag> braunerhaufenkacki <hashtag> afd <url>",
      "schema:author": "twitter",
      "schema:creator": "ErsatzWaise",
      "schema:datePublished": "2022-02-15T23:53:56Z",
      "schema:headline": "Ich habe gerade bei @schrodi_m einem mega Header gesehen \u2026\n\nIch lache mehr als ich sollte \ud83d\ude02\ud83d\ude02\n\n#FCKAFD #FCKNZS #NoAfD #KeinSexMitNazis #NazisSindDoof #BraunerHaufenKacki #AfD https://t.co/FbhW9VuF68",
      "schema:inLanguage": "de",
      "schema:keywords": [
         "FCKAFD",
         "FCKNZS",
         "NoAfD",
         "KeinSexMitNazis",
     

## Sentiment Analysis

- LIWC analysis task: annotates each tweet with the values obtained from the LIWC dictionaries.
- MFT analysis task: annotates each tweet with the values obtained from the MFT dictionary.


In [11]:
# Annotate every preprocessed tweet with two consecutive Senpy requests and
# collect the outcome of the second one.
sentimentAnalysis_results = []

for tweet in preprocessing_results:
    # First request.
    # NOTE(review): this call passes the whole `algorithms` list (both as the
    # algorithm argument and as 'algo'), whereas the second call passes a
    # single name -- confirm that sentimentAnalysis accepts both forms.
    liwc_request = {
        'apiKey': "",
        'algo': algorithms,
        "i": tweet["schema:articleBody"],
    }
    liwc_tweet = sentimentAnalysis(tweet, senpy_host, algorithms, liwc_request)

    # Second request: last configured algorithm, applied to the result of the
    # first request; the analysed text is still read from the loop's `tweet`.
    last_algorithm = algorithms[-1]
    mft_request = {
        'apiKey': "",
        'algo': last_algorithm,
        "i": tweet["schema:articleBody"],
    }
    mft_tweet = sentimentAnalysis(liwc_tweet, senpy_host, last_algorithm, mft_request)

    sentimentAnalysis_results.append(mft_tweet)

print(json.dumps(sentimentAnalysis_results, indent=3))

[
   {
      "@id": 1493735358806245376,
      "@type": [
         "schema:BlogPosting"
      ],
      "schema:about": "AfD",
      "schema:articleBody": "ich habe gerade bei <user> einem mega header gesehen \u2026\n\nich lache mehr als ich sollte \ud83d\ude02\ud83d\ude02\n\n<hashtag> fckafd <allcaps> <allcaps> <hashtag> fcknzs <allcaps> <allcaps> <hashtag> noafd <hashtag> keinsexmitnazis <hashtag> nazissinddoof <hashtag> braunerhaufenkacki <hashtag> afd <url>",
      "schema:author": "twitter",
      "schema:creator": "ErsatzWaise",
      "schema:datePublished": "2022-02-15T23:53:56Z",
      "schema:headline": "Ich habe gerade bei @schrodi_m einem mega Header gesehen \u2026\n\nIch lache mehr als ich sollte \ud83d\ude02\ud83d\ude02\n\n#FCKAFD #FCKNZS #NoAfD #KeinSexMitNazis #NazisSindDoof #BraunerHaufenKacki #AfD https://t.co/FbhW9VuF68",
      "schema:inLanguage": "de",
      "schema:keywords": [
         "FCKAFD",
         "FCKNZS",
         "NoAfD",
         "KeinSexMitNazis",
     