# Telegram Message Sentiment Analysis (**Vishal Kapadia**)


In [2]:
# Installing VaderSentiment
!pip install vaderSentiment



In [3]:
# Installing Spacy_langdetect
!pip install spacy_langdetect

Collecting spacy_langdetect
  Downloading spacy_langdetect-0.1.2-py3-none-any.whl (5.0 kB)
Collecting langdetect==1.0.7
  Downloading langdetect-1.0.7.zip (998 kB)
[K     |████████████████████████████████| 998 kB 5.2 MB/s 
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.7-py3-none-any.whl size=993430 sha256=e9f5408a0256fc553f80a7756c4dffb53ee780d4d714c973b9631d4e7aa3d2ea
  Stored in directory: /root/.cache/pip/wheels/89/79/3b/9885ae7f4308f73c514f96d8574d40d7d8173a27731b674013
Successfully built langdetect
Installing collected packages: langdetect, spacy-langdetect
Successfully installed langdetect-1.0.7 spacy-langdetect-0.1.2


In [4]:
# Installing Plotly
!pip install plotly



In [29]:
# Installing tqdm for Progress Monitoring
!pip install tqdm



In [31]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import re
import json
from tqdm import tqdm
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.stem.porter import *
import warnings
import csv
import datetime
from spacy_langdetect import LanguageDetector
import spacy
import plotly.io as pio
import plotly.graph_objects as go
warnings.filterwarnings("ignore")

In [34]:
#Reading the JSON File
# Loads the export in 'telegram.json' into `data`.
for i in tqdm(range(1),desc="Reading Data from JSON File"):
    # The context manager closes the file handle as soon as parsing finishes
    # (a bare open() left it open for the lifetime of the kernel).
    # UTF-8 is requested explicitly so decoding does not depend on the
    # platform's default locale encoding.
    with open('telegram.json', encoding='utf-8') as f:
        data = json.load(f)

Reading Data from JSON File: 100%|██████████| 1/1 [00:00<00:00,  4.81it/s]


In [35]:
# Build a spaCy English pipeline (`nlp`) whose last component is the language detector
for i in tqdm(range(1), desc="Setting up Spacy to detect English Language"):
    nlp = spacy.load('en')
    detector = LanguageDetector()
    nlp.add_pipe(detector, name='language_detector', last=True)

Setting up Spacy to detect English Language: 100%|██████████| 1/1 [00:00<00:00,  1.77it/s]


In [36]:
# Collect every plain-string message detected as English, together with the
# first 10 characters of its 'date' field.
messages = []
date = []

# Dedicated spaCy pipeline with the language detector as its final component.
lang = spacy.load('en')
lang.add_pipe(LanguageDetector(), name='language_detector', last=True)

for entry in tqdm(data["messages"], desc="Extracting Telegram Messages in English Language"):
    text = entry['text']
    # Entries whose 'text' is not a plain string are skipped.
    if not isinstance(text, str):
        continue
    # Keep only messages the detector labels as English.
    if lang(text)._.language['language'] != 'en':
        continue
    messages.append(text)
    date.append(entry['date'][0:10])

Extracting Telegram Messages in English Language: 100%|██████████| 49350/49350 [15:25<00:00, 53.32it/s]


In [37]:
# Keep only the messages that mention "shib" or "doge" (case-insensitive),
# carrying the matching date along with each kept message.
messages_final = []
date_final = []

for text, day in tqdm(zip(messages, date), total=len(messages), desc="Extracting Telegram Messages which contains SHIB and DOGE"):
    mentions_coin = any(re.search(keyword, text, re.IGNORECASE) for keyword in ("shib", "doge"))
    if mentions_coin:
        messages_final.append(text)
        date_final.append(day)

Extracting Telegram Messages which contains SHIB and DOGE: 100%|██████████| 32675/32675 [00:00<00:00, 180638.60it/s]


In [38]:
# Build a DataFrame pairing each filtered message with the date it was posted,
# then save it to 'telegram_msg.csv'.
telegram_json = {'date': date_final, 'messages': messages_final}
telegram_msg = pd.DataFrame(telegram_json)
telegram_msg.to_csv('telegram_msg.csv')

In [39]:
# Writes telegram_msg to 'telegram_msg.csv' again; the cell that builds
# telegram_msg already performs this same write, so this call is redundant.
telegram_msg.to_csv('telegram_msg.csv')

In [40]:
# Display full cell text: a max_colwidth of None turns off truncation of
# long column values when DataFrames are displayed.
pd.set_option('display.max_colwidth',None)

# **Messages Pre-Processing for Calculating Sentiment Score**

**Extracting Stopwords**

In [41]:
# Download the NLTK 'stopwords' corpus, which stopwords.words('english')
# reads in the following cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
# Load the English stop-word list into `stopword`; msg_clean() reads this
# module-level name to drop stop words from each message.
stopword = stopwords.words('english')
# Print the full list for inspection.
print("Stopwords:",stopword)

Stopwords: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so'

**Messages Cleaning Function**

In [43]:
# Normalise the raw message text in telegram_msg['messages'].

# 1) Strip URLs first. The punctuation pass below deletes ':', '.' and '/',
#    after which the URL pattern used by msg_clean() can no longer match, so
#    URLs must be removed while they are still intact.
telegram_msg['messages'] = telegram_msg['messages'].str.replace(
    r'https?://\S+|www\.\S+', ' ', case=False, regex=True
)

# 2) Remove a leading "RT" word. str.lstrip('RT') treats its argument as a set
#    of characters and would strip any leading run of 'R'/'T' (e.g. "Thanks"
#    -> "hanks"); the anchored pattern removes only the standalone word.
telegram_msg['messages'] = telegram_msg['messages'].str.replace(r'^RT\b', '', regex=True)

# 3) Remove the selected punctuation marks. regex=False makes every mark a
#    literal on all pandas versions ('.', '$' and '?' are regex metacharacters).
for mark in (":", ";", ".", ",", "!", "&", "-", "_", "$", "/", "?", "''"):
    telegram_msg['messages'] = telegram_msg['messages'].str.replace(mark, '', regex=False)

# 4) Lowercase
telegram_msg['messages'] = telegram_msg['messages'].str.lower()

In [44]:
#Message Clean Function
def msg_clean(msg, stop_words=None):
    """Strip noise from a single message and drop stop words.

    Removes, in order: URLs, @mentions, digit runs and HTML tags (each replaced
    by a space), then splits on whitespace, discards stop words and re-joins
    the remaining words with single spaces.

    Parameters
    ----------
    msg : str
        The message text to clean.
    stop_words : iterable of str, optional
        Words to discard. When omitted, the module-level ``stopword`` list is
        used, which is the behaviour existing callers rely on.

    Returns
    -------
    str
        The cleaned message; an empty string if nothing is left.
    """
    if stop_words is None:
        stop_words = stopword
    # A set gives constant-time membership tests instead of a list scan per word.
    excluded = set(stop_words)

    #Remove URL
    msg = re.sub(r'https?://\S+|www\.\S+', " ", msg)

    #Remove Mentions
    msg = re.sub(r'@\w+', ' ', msg)

    #Remove Digits
    msg = re.sub(r'\d+', ' ', msg)

    #Remove HTML tags
    # The raw-string prefix belongs outside the quotes: the previous pattern
    # 'r<.*?>' required a literal "r" before "<", so tags were never removed.
    msg = re.sub(r'<.*?>', ' ', msg)

    #Remove Stop Words
    return " ".join(word for word in msg.split() if word not in excluded)

In [45]:
# Run msg_clean over every message (cast to str first) and store the result
# in the 'Clean Messages' column.
for i in tqdm(range(1), desc="Cleaning Telegram Messages for Tokenizing"):
    raw_text = telegram_msg['messages'].astype(str)
    telegram_msg['Clean Messages'] = raw_text.apply(msg_clean)

Cleaning Telegram Messages for Tokenizing: 100%|██████████| 1/1 [00:00<00:00,  8.88it/s]


In [46]:
# Split every cleaned message on whitespace into a list of tokens
for i in tqdm(range(1), desc="Generating Tokens"):
    tokenize_msg = telegram_msg['Clean Messages'].str.split()

Generating Tokens: 100%|██████████| 1/1 [00:00<00:00, 135.60it/s]


**Tokenization**

In [47]:
# Join each token list back into one space-separated string, overwrite
# tokenize_msg in place with those strings, and store them as 'Clean Messages'.
joined_messages = [
    ' '.join(tokens)
    for tokens in tqdm(tokenize_msg, total=len(tokenize_msg), desc="Tokenizing Messages")
]
tokenize_msg[:] = joined_messages
telegram_msg['Clean Messages'] = tokenize_msg

Tokenizing Messages: 100%|██████████| 2204/2204 [00:00<00:00, 81865.44it/s]


# Generating Sentiment Score of Telegram Messages

---



In [48]:
# Score every cleaned message with VADER and attach the four polarity values
# to telegram_msg as the Compound / Negative / Neutral / Positive columns.
for i in tqdm(range(1), desc="Calculating Sentiment Scores"):
    analyser = SentimentIntensityAnalyzer()

    # One polarity dict per message, gathered into a DataFrame.
    scores = pd.DataFrame(
        [analyser.polarity_scores(sentence) for sentence in telegram_msg['Clean Messages']]
    )

    # (target column, key in the polarity dict), in the original insertion order.
    column_for_key = (
        ('Compound', 'compound'),
        ('Negative', 'neg'),
        ('Neutral', 'neu'),
        ('Positive', 'pos'),
    )
    for column, key in column_for_key:
        telegram_msg[column] = scores[key]

Calculating Sentiment Scores: 100%|██████████| 1/1 [00:00<00:00,  5.69it/s]


In [49]:
# Save the messages together with their sentiment columns to 'final_sentiments.csv'.
telegram_msg.to_csv("final_sentiments.csv")

In [50]:
# Distinct values of the 'date' column, as a plain list.
# NOTE(review): the original comment said these run from May 1, 2021 to
# May 15, 2021; the code does not enforce that range -- confirm against the data.
dates = telegram_msg['date'].unique().tolist()

In [51]:
# Calculating Number of Messages in a Day and Total Sentiments of Messages on that date.
# sum_sentiments[k] and count_msg_freq[k] both refer to dates[k].
sum_sentiments = [0]*len(dates)
count_msg_freq = [0]*len(dates)

# Resolve each date to its slot once, instead of running the linear
# dates.index() search twice for every row.
slot_of = {day: slot for slot, day in enumerate(dates)}

# Iterate the two columns positionally so the loop does not depend on the
# DataFrame having a default 0..n-1 index.
for current_date, compound in tqdm(
    zip(telegram_msg['date'], telegram_msg['Compound']),
    total=telegram_msg.shape[0],
    desc="Counting Total Messages Per Day",
):
    slot = slot_of[current_date]
    count_msg_freq[slot] += 1
    sum_sentiments[slot] += compound

Counting Total Messages Per Day: 100%|██████████| 2204/2204 [00:00<00:00, 59987.84it/s]


In [52]:
# Average sentiment per day: each day's summed compound score divided by its message count
avg_sentiments = [
    day_total / day_count
    for day_total, day_count in tqdm(
        zip(sum_sentiments, count_msg_freq),
        total=len(dates),
        desc="Calculating Average Sentiments Per Day",
    )
]

Calculating Average Sentiments Per Day: 100%|██████████| 15/15 [00:00<00:00, 87139.28it/s]


# Exploratory Data Analysis for Sentiments

In [53]:
# Configure the size of Plotly's "png" renderer (5000 x 1000).
# NOTE(review): this only sets the renderer's dimensions; no default renderer
# is selected here -- confirm that fig.show() actually uses "png".
for i in tqdm(range(1), desc="Setting up Plot Renderer to PNG"):
    png_renderer = pio.renderers["png"]
    png_renderer.width, png_renderer.height = 5000, 1000

Setting up Plot Renderer to PNG: 100%|██████████| 1/1 [00:00<00:00, 6909.89it/s]


In [54]:
# Bar chart: number of Telegram messages posted on each date
for i in tqdm(range(1), desc="Plotting Number of Telegram Message Per Day"):
    daily_count_bars = go.Bar(x=dates, y=count_msg_freq)
    fig = go.Figure(
        data=[daily_count_bars],
        layout={"title": {"text": "Number of Telegram Message Per Day"}},
    )
    fig.show()

Plotting Number of Telegram Message Per Day:   0%|          | 0/1 [00:00<?, ?it/s]

Plotting Number of Telegram Message Per Day: 100%|██████████| 1/1 [00:00<00:00,  6.84it/s]


In [55]:
# Bar chart: average compound sentiment of the Telegram messages on each date
for i in tqdm(range(1), desc="Plotting Average Sentiments of Telegram Message Per Day"):
    daily_sentiment_bars = go.Bar(x=dates, y=avg_sentiments)
    fig = go.Figure(
        data=[daily_sentiment_bars],
        layout={"title": {"text": "Average Sentiments of Telegram Message Per Day"}},
    )
    fig.show()

Plotting Average Sentiments of Telegram Message Per Day:   0%|          | 0/1 [00:00<?, ?it/s]

Plotting Average Sentiments of Telegram Message Per Day: 100%|██████████| 1/1 [00:00<00:00,  7.03it/s]
