In [5]:
"""
Contains various functions to process texts, mainly tweets.
"""

import string
import re
import pandas as pd
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from utils import extract_data
import settings

nltk.download("stopwords")
nltk.download("punkt")

def clean_text(text):
    # remove hyperlink
    text = re.sub(r"(?:https?\://)\S+", "", text)

    # remove special character
    text = "".join([char for char in text if char not in string.punctuation])

    # remove numbers
    text = re.sub(r"[0-9]+", "", text)

    # remove non-ascii characters, mainly different languages and emojis.
    text = text.encode('ascii', 'ignore').decode()

    return text


def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    filtered_sentence = " ".join(filtered_sentence)
    return filtered_sentence

def preprocess_text(text):
    text = clean_text(text)
    text = remove_stopwords(text)
    return text

def analyze_sentiment(text):
    text = TextBlob(text)
    polarity = text.sentiment.polarity
    subjectivity = text.sentiment.subjectivity
    return polarity, subjectivity

df = extract_data(settings.table_name)

df["text_punct_removed"] = df["text"].apply(clean_text)
df["stopwords_removed"] = df["text_punct_removed"].apply(remove_stopwords)
df["polarity"] = df["stopwords_removed"].apply(lambda x: analyze_sentiment(x)[0])
df.tail(10)


[nltk_data] Downloading package stopwords to /home/yang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/yang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,id_str,created_at,text,user_location,sentiment,text_punct_removed,stopwords_removed,polarity
121,1202655908184113159,2019-12-05 18:28:07,Interested in HudsonValley fresh apple crisp a...,"Poughkeepsie, NY",POSITIVE (0.9953964352607727),Interested in HudsonValley fresh apple crisp a...,Interested HudsonValley fresh apple crisp ice ...,0.2
122,1202655911413608448,2019-12-05 18:28:08,Trump once suggested all of Seouls million re...,"Chula Vista, CA",NEGATIVE (0.5060856342315674),Trump once suggested all of Seouls million re...,Trump suggested Seouls million residents move ...,0.0
123,1202655916870356992,2019-12-05 18:28:09,Whoever told me iPhone couldnt become G unless...,Myisha’s World Bitch🖕🏾,POSITIVE (0.8268633484840393),Whoever told me iPhone couldnt become G unless...,Whoever told iPhone couldnt become G unless Ap...,0.0
124,1202655919642775553,2019-12-05 18:28:10,Apple Music CEO to its staff,"Los Angeles, CA",POSITIVE (0.9050127267837524),Apple Music CEO to its staff,Apple Music CEO staff,0.0
125,1202655923766013955,2019-12-05 18:28:11,Making apple pie today What a day for it app...,"Houston, TX",POSITIVE (0.780019998550415),Making apple pie today What a day for it app...,Making apple pie today What day apples eateats...,0.0
126,1202655924684566528,2019-12-05 18:28:11,AppleMusicReplay CHECK ME OUT FOR YOUR REPLAY...,"New York, NY",POSITIVE (0.9254633188247681),AppleMusicReplay CHECK ME OUT FOR YOUR REPLAY...,AppleMusicReplay CHECK ME OUT FOR YOUR REPLAY ...,0.0
127,1202655924579520513,2019-12-05 18:28:11,heavy,"Honolulu, HI",POSITIVE (0.978206992149353),heavy,heavy,-0.2
128,1202655924881633282,2019-12-05 18:28:11,Brew Day CNPodNet ECNPodcast ChePJF CainMt Ke...,,POSITIVE (0.9993342757225037),Brew Day CNPodNet ECNPodcast ChePJF CainMt Ke...,Brew Day CNPodNet ECNPodcast ChePJF CainMt Ken...,0.0
129,1202655926945083392,2019-12-05 18:28:11,Tesla CyberTruck amp Apple iPhone \nThe Disru...,,POSITIVE (0.9914631247520447),Tesla CyberTruck amp Apple iPhone \nThe Disru...,Tesla CyberTruck amp Apple iPhone The Disrupti...,0.2
130,1202655927112884224,2019-12-05 18:28:11,Real one thumbmuthafucker,"Oregon, USA",POSITIVE (0.9439494609832764),Real one thumbmuthafucker,Real one thumbmuthafucker,0.2
