# NLP

In [1]:
# Initial imports
import os
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from pathlib import Path
import re
nltk.download('vader_lexicon')
nltk.download('stopwords')
analyzer = SentimentIntensityAnalyzer()

%matplotlib inline

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/tamobee/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tamobee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read in the tweets csv as a pandas DataFrame
file_path = Path("Resources/elon_tweets.csv")
tweets_data = pd.read_csv(file_path)
tweets_data.head()

Unnamed: 0,date,tweet
0,2021-01-15 04:24:47,@SuperclusterHQ @w00ki33 Fallout New Texas
1,2021-01-15 03:23:28,@Breedlove22 @benmezrich Only Chuck Norris can...
2,2021-01-15 03:18:10,@Cerberu21014829 @Breedlove22 @benmezrich Good...
3,2021-01-15 02:12:06,@Breedlove22 @benmezrich The thing we call mon...
4,2021-01-15 01:44:22,Monty Python is amazing https://t.co/UJq94IWT88


In [3]:
# Functions for cleaning the raw tweet data
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)        
    return input_txt

def clean_tweets(tweets):
    #remove twitter Return handles (RT @xxx:)
    tweets = np.vectorize(remove_pattern)(tweets, "RT @[\w]*:") 
    
    #remove twitter handles (@xxx)
    tweets = np.vectorize(remove_pattern)(tweets, "@[\w]*")
    
    #remove URL links (httpxxx)
    tweets = np.vectorize(remove_pattern)(tweets, "https?://[A-Za-z0-9./]*")
    
    #remove special characters, numbers, punctuations (except for #)
    tweets = np.core.defchararray.replace(tweets, "[^a-zA-Z]", " ")
    
    return tweets

In [4]:
# Create a column for cleaned tweet to original DataFrame
tweets_data['cleaned_tweet'] = clean_tweets(tweets_data['tweet'])
tweets_data

Unnamed: 0,date,tweet,cleaned_tweet
0,2021-01-15 04:24:47,@SuperclusterHQ @w00ki33 Fallout New Texas,Fallout New Texas
1,2021-01-15 03:23:28,@Breedlove22 @benmezrich Only Chuck Norris can...,Only Chuck Norris can divide by zero
2,2021-01-15 03:18:10,@Cerberu21014829 @Breedlove22 @benmezrich Good...,Good point
3,2021-01-15 02:12:06,@Breedlove22 @benmezrich The thing we call mon...,The thing we call money is just an informati...
4,2021-01-15 01:44:22,Monty Python is amazing https://t.co/UJq94IWT88,Monty Python is amazing
...,...,...,...
11846,2011-12-03 03:22:07,That was a total non sequitur btw,That was a total non sequitur btw
11847,2011-12-03 03:20:28,"Great Voltaire quote, arguably better than Twa...","Great Voltaire quote, arguably better than Twa..."
11848,2011-12-01 05:29:04,I made the volume on the Model S http://t.co/...,I made the volume on the Model S go to 11. ...
11849,2011-12-01 04:55:11,Went to Iceland on Sat to ride bumper cars on ...,Went to Iceland on Sat to ride bumper cars on ...


In [5]:
# Declare variables for scores
scores = []
compound_list = []
positive_list = []
negative_list = []
neutral_list = []

# Create the sentiment scores DataFrame for Cleaned Tweets
for i in range(tweets_data['cleaned_tweet'].shape[0]):
    try:
        compound = analyzer.polarity_scores(tweets_data['cleaned_tweet'][i])["compound"]
        pos = analyzer.polarity_scores(tweets_data['cleaned_tweet'][i])["pos"]
        neu = analyzer.polarity_scores(tweets_data['cleaned_tweet'][i])["neu"]
        neg = analyzer.polarity_scores(tweets_data['cleaned_tweet'][i])["neg"]
    
        scores.append({
            "Compound": compound, 
            "Positive": pos, 
            "Negative": neg, 
            "Neutral": neu
        })
    except AttributeError:
        pass

sentiment_scores = pd.DataFrame(scores)
sentiment_scores

Unnamed: 0,Compound,Positive,Negative,Neutral
0,0.0000,0.000,0.000,1.000
1,0.0000,0.000,0.000,1.000
2,0.4404,0.744,0.000,0.256
3,0.7317,0.162,0.000,0.838
4,0.5859,0.559,0.000,0.441
...,...,...,...,...
11846,0.0000,0.000,0.000,1.000
11847,0.5994,0.309,0.186,0.505
11848,0.0000,0.000,0.000,1.000
11849,0.8588,0.325,0.067,0.608


In [6]:
# Describe the Tweet sentiment
sentiment_scores.describe()

Unnamed: 0,Compound,Positive,Negative,Neutral
count,11851.0,11851.0,11851.0,11851.0
mean,0.181595,0.208528,0.04941,0.71624
std,0.365312,0.291304,0.124366,0.315792
min,-0.9638,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.5785
50%,0.0,0.084,0.0,0.805
75%,0.4404,0.302,0.0,1.0
max,0.9787,1.0,1.0,1.0


In [7]:
# Save the Vader scores DataFrame as a csv file
sentiment_scores.to_csv("vader_scores.csv")

In [8]:
# Join Tweets DataFrame and sentiment scores DataFrame
#scores_df = pd.DataFrame.from_dict(scores)
tweets_data = tweets_data.join(sentiment_scores)
tweets_data

Unnamed: 0,date,tweet,cleaned_tweet,Compound,Positive,Negative,Neutral
0,2021-01-15 04:24:47,@SuperclusterHQ @w00ki33 Fallout New Texas,Fallout New Texas,0.0000,0.000,0.000,1.000
1,2021-01-15 03:23:28,@Breedlove22 @benmezrich Only Chuck Norris can...,Only Chuck Norris can divide by zero,0.0000,0.000,0.000,1.000
2,2021-01-15 03:18:10,@Cerberu21014829 @Breedlove22 @benmezrich Good...,Good point,0.4404,0.744,0.000,0.256
3,2021-01-15 02:12:06,@Breedlove22 @benmezrich The thing we call mon...,The thing we call money is just an informati...,0.7317,0.162,0.000,0.838
4,2021-01-15 01:44:22,Monty Python is amazing https://t.co/UJq94IWT88,Monty Python is amazing,0.5859,0.559,0.000,0.441
...,...,...,...,...,...,...,...
11846,2011-12-03 03:22:07,That was a total non sequitur btw,That was a total non sequitur btw,0.0000,0.000,0.000,1.000
11847,2011-12-03 03:20:28,"Great Voltaire quote, arguably better than Twa...","Great Voltaire quote, arguably better than Twa...",0.5994,0.309,0.186,0.505
11848,2011-12-01 05:29:04,I made the volume on the Model S http://t.co/...,I made the volume on the Model S go to 11. ...,0.0000,0.000,0.000,1.000
11849,2011-12-01 04:55:11,Went to Iceland on Sat to ride bumper cars on ...,Went to Iceland on Sat to ride bumper cars on ...,0.8588,0.325,0.067,0.608


In [9]:
# Save the cleaned Tweets and sentiment scores DataFrame as a csv file 
tweets_data.to_csv("cleaned_tweets_vader_scores.csv")

## Tokenizer

In [10]:
# Imports
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import string
import re

lemmatizer = WordNetLemmatizer()
sw_addon = stopwords.words('english')
sw_addon.append("u")
sw_addon.append("it'")
sw_addon.append("char")
sw_addon.append("’")
sw_addon.append("…")
sw_addon.append("”")
sw_addon.append('“')
sw_addon.append('”')
sw_addon = set(sw_addon)

In [11]:
# Tokenize tweets
def tokenizer(text):
    """Tokenizes text."""   
    # Create a list of the words
    words = word_tokenize(text)
    # Convert the words to lowercase
    words = list(filter(lambda w: w.lower(), words))   
    # Remove the punctuation
    words = list(filter(lambda t: t not in punctuation, words))   
    # Remove the stopwords
    words = list(filter(lambda t: t.lower() not in sw_addon, words))   
    # Lemmatize Words into root words
    tokens = [lemmatizer.lemmatize(word) for word in words]
    
    return tokens

In [12]:
# Add the tokens column to the Tweets DataFrame
tweets_data["tokens"] = tweets_data.cleaned_tweet.apply(tokenizer)
tweets_data

Unnamed: 0,date,tweet,cleaned_tweet,Compound,Positive,Negative,Neutral,tokens
0,2021-01-15 04:24:47,@SuperclusterHQ @w00ki33 Fallout New Texas,Fallout New Texas,0.0000,0.000,0.000,1.000,"[Fallout, New, Texas]"
1,2021-01-15 03:23:28,@Breedlove22 @benmezrich Only Chuck Norris can...,Only Chuck Norris can divide by zero,0.0000,0.000,0.000,1.000,"[Chuck, Norris, divide, zero]"
2,2021-01-15 03:18:10,@Cerberu21014829 @Breedlove22 @benmezrich Good...,Good point,0.4404,0.744,0.000,0.256,"[Good, point]"
3,2021-01-15 02:12:06,@Breedlove22 @benmezrich The thing we call mon...,The thing we call money is just an informati...,0.7317,0.162,0.000,0.838,"[thing, call, money, information, system, labo..."
4,2021-01-15 01:44:22,Monty Python is amazing https://t.co/UJq94IWT88,Monty Python is amazing,0.5859,0.559,0.000,0.441,"[Monty, Python, amazing]"
...,...,...,...,...,...,...,...,...
11846,2011-12-03 03:22:07,That was a total non sequitur btw,That was a total non sequitur btw,0.0000,0.000,0.000,1.000,"[total, non, sequitur, btw]"
11847,2011-12-03 03:20:28,"Great Voltaire quote, arguably better than Twa...","Great Voltaire quote, arguably better than Twa...",0.5994,0.309,0.186,0.505,"[Great, Voltaire, quote, arguably, better, Twa..."
11848,2011-12-01 05:29:04,I made the volume on the Model S http://t.co/...,I made the volume on the Model S go to 11. ...,0.0000,0.000,0.000,1.000,"[made, volume, Model, go, 11, need, work, mini..."
11849,2011-12-01 04:55:11,Went to Iceland on Sat to ride bumper cars on ...,Went to Iceland on Sat to ride bumper cars on ...,0.8588,0.325,0.067,0.608,"[Went, Iceland, Sat, ride, bumper, car, ice, c..."


In [None]:
# Save the Tweets DataFrame with the tokens as a csv
tweets_data.to_csv("tokens.csv")

## NGrams and Frequency Analysis

In [None]:
# Imports
from collections import Counter
from nltk import ngrams
import inspect

In [None]:
def seriesToList(s):
    """"Converts series into one list"""
    
    lst = []      
    # traverse in the lists   
    for ele in s:  
        lst += ele     
    # return list   
    return lst

tweets = seriesToList(tweets_data["tokens"])

In [None]:
bigrams = ngrams(tweets, n=2)
tweets_dict = dict(Counter(bigrams).most_common(50))
tweets_bigrams = tweets_dict.items()
list(tweets_bigrams)

In [None]:
def token_count(tokens, N=15):
    """Returns the top N tokens from the frequency count"""
    return Counter(tokens).most_common(N)

In [None]:
token_count(tweets)