<a href="https://colab.research.google.com/github/w1756015-m-zidani/Final-Year-Project/blob/main/Final_verson_tweet_cleaning_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install vaderSentiment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
pip install afinn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Import Libraries

In [3]:
import pandas as pd
import re
import string
import nltk
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from afinn import Afinn
import spacy


# Load the 'en_core_web_sm' spaCy pipeline once; tokenize_and_lemmatize calls
# this shared `nlp` object for every tweet it processes.
nlp = spacy.load('en_core_web_sm')


Define Functions for Tokenization and Cleaning of Tweets

In [5]:

# Define a function to perform tokenization and lemmatization using spaCy
def tokenize_and_lemmatize(text):
    """Return the lower-cased lemmas of `text`, joined with a space.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens
    flagged as punctuation, stop words or whitespace are skipped; every
    remaining token contributes its lemma, lower-cased and stripped.
    """
    lemmas = []
    for tok in nlp(text):
        # Skip punctuation, stop-word and whitespace tokens
        if tok.is_punct or tok.is_stop or tok.is_space:
            continue
        lemmas.append(tok.lemma_.lower().strip())
    return ' '.join(lemmas)

# Define a function to clean the tweets
def clean_tweet(tweet):
    """Normalise a raw tweet into a lower-case, lemmatized string.

    Steps, in order: strip URLs, strip @mentions and #hashtags, drop every
    character that is not an ASCII letter, digit or whitespace, lower-case the
    text, then pass it to tokenize_and_lemmatize.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for example the float NaN that
        pandas produces for an empty CSV cell, or None) is treated as an
        empty tweet.

    Returns
    -------
    str
        The cleaned text, or '' when `tweet` is not a string.
    """
    # re.sub raises TypeError on NaN/None, so missing cells short-circuit here.
    if not isinstance(tweet, str):
        return ''
    # Remove URLs
    tweet = re.sub(r'http\S+', '', tweet)
    # Remove mentions and hashtags (the whole tag, not just the symbol)
    tweet = re.sub(r'@\w+|#\w+', '', tweet)
    # Keep only ASCII letters, digits and whitespace. This single pass also
    # removes every character in string.punctuation, so no separate
    # punctuation-stripping step is needed.
    tweet = re.sub(r'[^A-Za-z0-9\s]+', '', tweet)
    # Convert everything to lowercase
    tweet = tweet.lower()
    # Tokenize and lemmatize the tweet
    return tokenize_and_lemmatize(tweet)


Load & clean the Data

In [6]:
# Read the raw tweets; the CSV is decoded as ISO-8859-1
df = pd.read_csv('Demo Tweets.csv', encoding='ISO-8859-1')

# Add a 'cleaned_text' column holding clean_tweet's result for each 'Tweet'
df['cleaned_text'] = df['Tweet'].map(clean_tweet)


In [7]:
# Preview the first rows to compare the raw 'Tweet' with its 'cleaned_text'
df.head()


Unnamed: 0,Tweet,cleaned_text
0,The Daily Mail has really turned on Boris John...,daily mail turn boris johnson guess owner edit...
1,"????Boris Johnson, primer ministro británico, ...",boris johnson primer ministro britnico dijo qu...
2,More Tax payers money Laundering through Boris...,tax payer money laundering boris johnson bill ...
3,@paulwaugh We love Boris Johnson \n\nThe Briti...,love boris johnson british people place trust ...
4,Just like the time he reframed the google resu...,like time reframe google result boris johnson ...


Remove non-english tweets

In [8]:
!pip install langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [9]:
from langdetect import detect

In [10]:

# Define a function to remove non-English tweets
def remove_non_english_tweets(df):
    """Return a copy of `df` holding only the rows whose 'Tweet' is detected as English.

    The language of each raw 'Tweet' value is detected with langdetect and
    rows whose detected code is 'en' are kept. The caller's frame is not
    modified: no helper column is written to it, and the result is an
    independent copy, so columns can be added to it afterwards without
    touching `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Tweet' column of raw tweet text.

    Returns
    -------
    pandas.DataFrame
        The English rows, with the original index labels and columns.

    Notes
    -----
    Rows that cannot be classified (non-string cells, or text for which
    langdetect raises LangDetectException) are treated as non-English and
    dropped instead of aborting the whole run.
    NOTE(review): langdetect's documentation describes detection as
    non-deterministic unless DetectorFactory.seed is set - consider seeding
    it once if the resulting row count must be reproducible.
    """
    def _language_of(text):
        # Non-string cells (e.g. NaN from an empty CSV field) have no language.
        if not isinstance(text, str):
            return None
        # Imported here so the exception type is only resolved when a
        # detection actually runs.
        from langdetect.lang_detect_exception import LangDetectException
        try:
            return detect(text)
        except LangDetectException:
            # Raised by langdetect when it finds nothing to classify in the text.
            return None

    is_english = df['Tweet'].map(_language_of) == 'en'
    return df[is_english].copy()

In [11]:
# Keep only the tweets detected as English; df2 is the frame the sentiment cells score
df2 = remove_non_english_tweets(df)

In [12]:
# (rows, columns) remaining after the language filter
df2.shape

(955, 2)

In [13]:
# Preview the filtered frame; index labels from the original frame are kept
df2.head()

Unnamed: 0,Tweet,cleaned_text
0,The Daily Mail has really turned on Boris John...,daily mail turn boris johnson guess owner edit...
2,More Tax payers money Laundering through Boris...,tax payer money laundering boris johnson bill ...
3,@paulwaugh We love Boris Johnson \n\nThe Briti...,love boris johnson british people place trust ...
4,Just like the time he reframed the google resu...,like time reframe google result boris johnson ...
6,@44tunafish @northumbrian_ @BootsVernon @Jim_C...,like boris johnson explicitly not want people ...


Perform Sentiment Analysis using TextBlob

In [14]:
# Score each cleaned tweet with TextBlob and store its polarity value
df2['polarity_textblob'] = df2['cleaned_text'].map(
    lambda cleaned: TextBlob(cleaned).sentiment.polarity
)


Perform Sentiment Analysis using VADER

In [15]:
# Score each cleaned tweet with VADER and keep only the 'compound' value.
# NOTE(review): the input here is 'cleaned_text', which has punctuation and
# capitalisation removed - VADER's documentation should be checked to confirm
# whether scoring the raw 'Tweet' column would be more appropriate.
analyzer = SentimentIntensityAnalyzer()
df2['sentiment_vader'] = df2['cleaned_text'].map(
    lambda cleaned: analyzer.polarity_scores(cleaned)['compound']
)


Perform Sentiment Analysis using AFINN

In [16]:
# Score each cleaned tweet with AFINN; the bound method is applied directly
afinn = Afinn()
df2['sentiment_afinn'] = df2['cleaned_text'].apply(afinn.score)


Print Result

In [17]:
# Preview the first rows with the three sentiment columns
# (polarity_textblob, sentiment_vader, sentiment_afinn) alongside the text
df2.head()

Unnamed: 0,Tweet,cleaned_text,polarity_textblob,sentiment_vader,sentiment_afinn
0,The Daily Mail has really turned on Boris John...,daily mail turn boris johnson guess owner edit...,0.0,0.0,0.0
2,More Tax payers money Laundering through Boris...,tax payer money laundering boris johnson bill ...,0.0,0.0,0.0
3,@paulwaugh We love Boris Johnson \n\nThe Briti...,love boris johnson british people place trust ...,0.433333,0.9118,7.0
4,Just like the time he reframed the google resu...,like time reframe google result boris johnson ...,0.175,0.6249,2.0
6,@44tunafish @northumbrian_ @BootsVernon @Jim_C...,like boris johnson explicitly not want people ...,-0.5,-0.2076,1.0


In [17]:
# pandas is already imported in the imports cell; this repeat import is harmless
import pandas as pd
# Write the scored tweets to an Excel workbook, omitting the index column.
# NOTE(review): the file name suggests a throwaway test export - confirm
# whether this cell should stay in the final notebook.
df2.to_excel('testoutputdeleteafter.xlsx', index=False)

Calculating proportion of sentiment

In [18]:
def calculate_sentiment_proportions(df, sentiment_column):
    """Print the percentage of positive, negative and neutral scores in a column.

    A score counts as positive when it is > 0, negative when < 0 and neutral
    when == 0. Rows whose score matches none of these comparisons (NaN) are
    left out of the total.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the sentiment scores.
    sentiment_column : str
        Name of the numeric score column to summarise.

    Returns
    -------
    None
        The three percentages are printed, one per line.
    """
    scores = df[sentiment_column]
    # Count each class from a boolean mask; int() keeps plain Python numbers
    # so the printed values are formatted as ordinary floats.
    num_positive = int((scores > 0).sum())
    num_negative = int((scores < 0).sum())
    num_neutral = int((scores == 0).sum())

    total = num_positive + num_negative + num_neutral
    if total == 0:
        # No comparable scores (empty frame or all NaN): report zeros rather
        # than raising ZeroDivisionError.
        prop_positive = prop_negative = prop_neutral = 0.0
    else:
        prop_positive = (num_positive / total)*100
        prop_negative = (num_negative / total)*100
        prop_neutral = (num_neutral / total)*100

    # Labels are kept exactly as they appear in the notebook's recorded output.
    print("Postive", prop_positive)
    print("Negative", prop_negative)
    print("Neutral", prop_neutral)


In [19]:
# Report the positive/negative/neutral split for each scorer in turn:
# (heading printed before the figures, score column in df2)
sentiment_reports = [
    ("Textblob:", 'polarity_textblob'),
    ("VADAR:", 'sentiment_vader'),
    ("AFINN", 'sentiment_afinn'),
]
for position, (heading, column) in enumerate(sentiment_reports):
    # A blank line separates consecutive reports, with none after the last
    if position:
        print()
    print(heading)
    calculate_sentiment_proportions(df2, column)


Textblob:
Postive 24.083769633507853
Negative 30.157068062827225
Neutral 45.75916230366492

VADAR:
Postive 29.214659685863875
Negative 46.38743455497382
Neutral 24.397905759162303

AFINN
Postive 21.57068062827225
Negative 52.35602094240838
Neutral 26.07329842931937
