In [1]:
import tweepy
import pandas as pd
import requests
import json
import emoji
from unicodedata import normalize

In [2]:
import re
import matplotlib.pyplot as plt
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import SnowballStemmer, PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/eduarde/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eduarde/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eduarde/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/eduarde/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [47]:
auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth ,wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [136]:
def get_tweets(hashtag_list):
    tweets_amlo = pd.DataFrame(columns=['user', "date_time", "likes", "tweet"])
    for hashtag in hashtag_list:
        search_tw = tweepy.Cursor(api.search, q=f"{hashtag} -filter:retweets", tweet_mode='extended', count=2000).items(2000)
        tweets = [[tweet.user.screen_name, str(tweet.created_at), tweet.user.favourites_count, tweet.full_text] 
                   for tweet in search_tw]
        tweets_df = pd.DataFrame(data=tweets, columns=['user', "date_time", "likes", "tweet"])
        tweets_amlo = pd.concat([tweets_amlo, tweets_df])
    return tweets_amlo

In [137]:
hashtag_list = ['#PresidenteRompaElPacto', '#AMLOEnfermoMental', '#AMLOEstamosContigo', '#AMLORenuncia', '#RenunciaAMLO', '#AMLOElFracasoPresidencial',
                '#AMLOMasFuerteQueNunca', '#LopezFracasoPresidencial', '#MentirasDeCuarta', '#AMLOMuestraTusDatos', '#AMLOPresidenteDeLaSalud',
                '#AMLOLujoDePresidente']

tweets_amlo = get_tweets(hashtag_list)

In [187]:
with open('spanish_offensive_words') as f:
    bad_words = [x.strip() for x in f.readlines()]

In [228]:
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U0001F1F2-\U0001F1F4"  # Macau flag
        u"\U0001F1E6-\U0001F1FF"  # flags
        u"\U0001F600-\U0001F64F"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U0001F1F2"
        u"\U0001F1F4"
        u"\U0001F620"
        u"\u200d"
        u"\u2640-\u2642"
        "]+", flags=re.UNICODE)

In [251]:
def tokenizer(tweet):
    tweet = re.sub("@[A-Za-z0-9]+","",tweet) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
    allchars = [str for str in tweet]
    emoji_list = [c for c in allchars if c in emoji.UNICODE_EMOJI]
    tweet = ' '.join([str for str in tweet.split() if not any(i in str for i in emoji_list)])
    #tweet = ''.join(c for c in tweet if c not in emoji.UNICODE_EMOJI) #Remove Emojis
    tweet = emoji_pattern.sub(r'', tweet)
    tweet = emoji.get_emoji_regexp().sub(u'', tweet)
    tweet = ' '.join(' '.join(re.findall('[A-Z][^A-Z]*', word)) if word.startswith('#') else word for word in tweet.split())
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    tweet = tweet.strip()
    stop_words = set(stopwords.words('spanish'))
    clean_text = re.sub(r'\W+', ' ', tweet).lower()
    removed_numbers = ''.join(filter(lambda x: not x.isdigit(), clean_text))
    removed_singles = re.sub(r"\b[a-zA-Z]\b", "", removed_numbers)
    removed_stopwords = [word for word in word_tokenize(removed_singles) if not word in stop_words]
    filtered_text = ' '.join(removed_stopwords)
    return filtered_text

In [299]:
tweets_amlo['tokenize_tweets'] = tweets_amlo['tweet'].map(lambda x: tokenizer(x))

In [387]:
tweets_amlo['offensive'] = tweets_amlo['tokenize_tweet'].str.contains('|'.join(bad_words))

In [388]:
tweets_amlo['offensive'].replace({False: int(0), True: int(1)}, inplace=True)

In [390]:
tweets_amlo.offensive.value_counts()

0    12017
1     1569
Name: offensive, dtype: int64

In [None]:
from sentiment_analysis_spanish import sentiment_analysis

def get_sentiment_analysis(text):
    sentiment = sentiment_analysis.SentimentAnalysisSpanish()
    analysis = sentiment.sentiment(text)
    return np.format_float_positional(analysis, trim='-')

In [None]:
tweets_amlo["sentiment"] = tweets_amlo["tokenized_tweet"].map(lambda x: get_sentiment_analysis(x))

In [None]:
def get_hate_speech(offensive, sentiment):
    if (offensive == 1) and (sentiment < 0.5):
        return 1
    elif (offensive == 1) and (sentiment > 0.5):
        return 0
    elif (offensive == 0) and (sentiment < 0.1):
        return 1
    elif (offensive == 0) and (sentiment > 0.1):
        return 0
    else:
        return 0   

In [None]:
func = lambda x: get_hate_speech(offensive=x.offensive, sentiment=x.sentiment)
tweets_amlo["hate_speech"] = tweets_amlo.apply(func, axis=1)