In [None]:
# !pip install instagrapi

In [4]:
from instagrapi import Client
import pickle
from math import inf as Inf

### GETTING THAT DATA
import os
from getpass import getpass

# Post to analyse and the number of comments to request from it.
POST_URL = 'https://www.instagram.com/p/CwJVnryNvxW/'
COMMENT_LIMIT = 20  # change this to control how many comments are extracted

# Credentials are never stored in the notebook: they are read from the
# INSTA_USERNAME / INSTA_PASSWORD environment variables, and asked for
# interactively (password without echo) when a variable is missing or empty.
USERNAME = os.environ.get("INSTA_USERNAME") or input("Instagram username: ")
PASSWORD = os.environ.get("INSTA_PASSWORD") or getpass("Instagram password: ")

cl = Client()
cl.login(USERNAME, PASSWORD)

# Turn the post URL into the media id that is passed to media_comments.
media_id = cl.media_id(cl.media_pk_from_url(POST_URL))
comments = cl.media_comments(media_id, COMMENT_LIMIT)

# Store the fetched comments in comments.pkl.
with open('comments.pkl', 'wb') as f:
    pickle.dump(comments, f)

In [5]:
# Read the pickled comments back from disk
with open('comments.pkl', 'rb') as pickle_file:
    loaded_dict = pickle.load(pickle_file)

# A pandas DataFrame makes the comments easier to manipulate
import pandas as pd
df = pd.DataFrame(data=loaded_dict)
print("number of rows:", len(df.index))

number of rows: 19


In [6]:
# Cleaning the data
# The first element of each cell in the first row is used as the real name of
# the column that cell sits in.
first_row = df.iloc[0].to_list()
column_names = {
    old_label: cell[0]
    for old_label, cell in zip(df.columns, first_row)
}

# Replace the current column labels with the names collected above
df = df.rename(columns=column_names)


In [7]:
# Keep only the second element of every cell, dropping the header stored in front of it
for column in df.columns:
    # Snapshot the column first so the in-place writes cannot disturb the iteration.
    for row_label, cell in list(df[column].items()):
        df.at[row_label, column] = cell[1]

print(dir(df.at[1, 'user']))

['Config', '__abstractmethods__', '__annotations__', '__class__', '__class_vars__', '__config__', '__custom_root_type__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__exclude_fields__', '__fields__', '__fields_set__', '__format__', '__ge__', '__get_validators__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__include_fields__', '__init__', '__init_subclass__', '__iter__', '__json_encoder__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__post_root_validators__', '__pre_root_validators__', '__pretty__', '__private_attributes__', '__reduce__', '__reduce_ex__', '__repr__', '__repr_args__', '__repr_name__', '__repr_str__', '__rich_repr__', '__schema_cache__', '__setattr__', '__setstate__', '__signature__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__try_update_forward_refs__', '__validators__', '_abc_impl', '_calculate_keys', '_copy_and_set_values', '_decompose_class', '_enforce_dict_if_root', '_get_value', '_init_private_attribute

In [11]:
# Special edit: replace each user object with its username and keep its pk as user_id
for row_label in df.index:
    user_obj = df.at[row_label, 'user']
    df.at[row_label, 'user'] = user_obj.username
    df.at[row_label, 'user_id'] = user_obj.pk
df = df.rename(columns={'user': 'username'})

# Only the text, username and timestamp are needed from here on
wanted_columns = ['text', 'username', 'created_at_utc']
new_df = df.loc[:, wanted_columns]
new_df

Unnamed: 0,text,username,created_at_utc
0,❤️❤️❤️,ayesha__2314,2023-08-28 06:28:51+00:00
1,Beshak,samia.javed.9465,2023-08-26 14:55:26+00:00
2,Excatly,loyal_rajpoot_17866,2023-08-25 17:31:31+00:00
3,Follow for more,raeesul_official,2023-08-24 15:15:24+00:00
4,Baishak ❤️❤️❤️,h.mussab,2023-08-24 13:39:45+00:00
5,Beshak,tasawarpathan786,2023-08-24 05:49:23+00:00
6,Beshak,miralmiral9070,2023-08-23 18:00:42+00:00
7,❤️❤️❤️❤️,rajputnaira30,2023-08-22 15:33:08+00:00
8,Beshk,kamal_khan5340,2023-08-22 14:52:04+00:00
9,Beshak,candajutt,2023-08-22 11:34:19+00:00


In [8]:
import requests
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from scipy.special import softmax


# Checkpoint that supplies the tokenizer, config and sentiment classifier
MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment-latest'


# Load the tokenizer, the config and the classification model from MODEL_NAME,
# in that order.
tokenizer, config, model = (
    auto_class.from_pretrained(MODEL_NAME)
    for auto_class in (AutoTokenizer, AutoConfig, AutoModelForSequenceClassification)
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Function to analyze sentiment
def analyze_sentiment(text):
    """Classify the sentiment of ``text`` with the module-level tokenizer and model.

    Args:
        text: Text handed to the tokenizer; it is truncated to at most 128 tokens.

    Returns:
        dict with two entries:
          * ``"label"``: ``"negative"``, ``"neutral"`` or ``"positive"``, the class
            with the highest probability.
          * ``"probability"``: dict with keys ``"neg"``, ``"neutral"`` and ``"pos"``
            holding the softmax probabilities (NumPy scalars) of the three classes.
    """
    # Local import keeps this cell self-contained; the model already runs on
    # PyTorch tensors (return_tensors='pt').
    import torch

    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)

    # Inference only: no_grad() avoids recording the autograd graph for the forward pass.
    with torch.no_grad():
        output = model(**encoded_input)

    # Logits of the first (and only) sequence in the batch, turned into probabilities.
    scores = output.logits[0].detach().numpy()
    scores = softmax(scores, axis=0)

    # Labels corresponding to sentiment classes.
    # NOTE(review): assumed to follow the checkpoint's class-index order; confirm against config.id2label.
    labels = ["negative", "neutral", "positive"]

    json_sent = {
        "label": labels[np.argmax(scores)],
        "probability": {
            "neg": scores[0],  # Negative sentiment score
            "neutral": scores[1],  # Neutral sentiment score
            "pos": scores[2]  # Positive sentiment score
        }
    }

    return json_sent


In [12]:
import pandas as pd
import numpy as np
 
# Assuming you already have 'new_df' with the comments

# Run the sentiment analysis on every comment and collect one record per comment.
# The DataFrame is built once from these records instead of being grown row by
# row with .loc, which re-allocates the frame on every insert and leaves the
# column dtypes to row-wise inference.
sentiment_records = []
for comment in new_df['text']:
    # Analyze sentiment for the comment using the analyze_sentiment function
    sentiment_result = analyze_sentiment(comment)
    probability = sentiment_result['probability']

    sentiment_records.append({
        'textDisplay': comment,
        'label': sentiment_result['label'],
        # float() stores plain Python floats so the numeric columns are float64
        'pos': float(probability['pos']),
        'neg': float(probability['neg']),
        'neutral': float(probability['neutral']),
    })

# Same columns as before, and the same index labels as new_df
result_df = pd.DataFrame(
    sentiment_records,
    columns=['textDisplay', 'label', 'pos', 'neg', 'neutral'],
    index=new_df.index,
)

# Print the resulting DataFrame with sentiment analysis results
print(result_df.head())  # Print the first 5 entries

# Save the DataFrame to a CSV file
result_df.to_csv('sentiment_analysis_results.csv', index=False)


       textDisplay     label       pos       neg   neutral
0           ❤️❤️❤️  positive  0.944387  0.007543  0.048070
1           Beshak   neutral  0.207550  0.117964  0.674486
2          Excatly   neutral  0.226173  0.073018  0.700809
3  Follow for more   neutral  0.331225  0.023652  0.645123
4   Baishak ❤️❤️❤️  positive  0.956012  0.003340  0.040648


##### Calculating the overall sentiment of the comment section

In [13]:
# Average each probability column over all comments (plain, unweighted means;
# the "weighted_avg" names are kept as they are)
weighted_avg_pos = result_df['pos'].mean()
weighted_avg_neg = result_df['neg'].mean()
weighted_avg_neutral = result_df['neutral'].mean()

print("weighted_avg_pos ", weighted_avg_pos)
print("weighted_avg_neg ", weighted_avg_neg)
print("weighted_avg_neutral ", weighted_avg_neutral)


# The overall label is the class whose average is the largest
max_weighted_avg = max(weighted_avg_pos, weighted_avg_neg, weighted_avg_neutral)
print("max_weighted_avg ",max_weighted_avg)

# Positive is checked first, then negative; neutral is the fallback
label = 'neutral'
for candidate, average in (('positive', weighted_avg_pos), ('negative', weighted_avg_neg)):
    if max_weighted_avg == average:
        label = candidate
        break

# Report the label that was picked
print(f"The Overall determined sentiment label is: {label}")


weighted_avg_pos  0.5563612
weighted_avg_neg  0.05886369
weighted_avg_neutral  0.38477513
max_weighted_avg  0.5563612
The Overall determined sentiment label is: positive


##### Calculating the absolute number of positive/negative/neutral comments

In [14]:
# Number of rows in the results table
total_comments = result_df.shape[0]

# For each label, count the rows carrying it that have a non-null comment text
label_column = result_df['label']
num_positive_comments = result_df.loc[label_column == 'positive', 'textDisplay'].count()
num_negative_comments = result_df.loc[label_column == 'negative', 'textDisplay'].count()
num_neutral_comments = result_df.loc[label_column == 'neutral', 'textDisplay'].count()

# Report the totals
print(f"Total Comments: {total_comments}")
print(f"Number of Positive Comments out of {total_comments} = {num_positive_comments}")
print(f"Number of Negative Comments out of {total_comments} = {num_negative_comments}")
print(f"Number of Neutral Comments out of {total_comments} = {num_neutral_comments}")


Total Comments: 19
Number of Positive Comments out of 19 = 9
Number of Negative Comments out of 19 = 0
Number of Neutral Comments out of 19 = 10


##### Calculating the percentage of positive/negative/neutral comments

In [15]:
# Express each per-label count as a percentage of all comments
def _as_percentage(count):
    """Return ``count`` as a percentage of ``total_comments``."""
    return (count / total_comments) * 100


percentage_positive = _as_percentage(num_positive_comments)
percentage_negative = _as_percentage(num_negative_comments)
percentage_neutral = _as_percentage(num_neutral_comments)

# Report the percentages with two decimals
print(f"Percentage of Positive Comments: {percentage_positive:.2f}%")
print(f"Percentage of Negative Comments: {percentage_negative:.2f}%")
print(f"Percentage of Neutral Comments: {percentage_neutral:.2f}%")


Percentage of Positive Comments: 47.37%
Percentage of Negative Comments: 0.00%
Percentage of Neutral Comments: 52.63%


##### The most common words that appeared in the positive comments

In [16]:
from nltk import FreqDist
import operator

import re
# the words that appear the most in positive comments
import nltk
porter = nltk.PorterStemmer()

# Texts of all comments labelled 'positive', selected once instead of
# re-filtering result_df on every loop iteration
list_pos = result_df.loc[result_df['label'] == 'positive', 'textDisplay'].tolist()

lst_words_pos = []
for line in list_pos:
    # Raw string so the backslashes reach the regex engine unchanged.
    # `\.{3}` splits on a literal "..."; the previous `\...` matched a dot followed
    # by ANY two characters and therefore swallowed the text after every full stop.
    text_pos = re.split(r'\n| |\?|!|:|"|\(|\)|\.{3}|;', line)
    for word in text_pos:
        # Keep words longer than 3 characters that do not start with '@' or '#' and are not 'RT'
        if len(word) > 3 and not word.startswith(('@', '#')) and word != 'RT':
            lst_words_pos.append(porter.stem(word.lower()))


# Stemmed, lower-cased words ordered from most to least frequent
dist_pos = FreqDist(lst_words_pos)
sorted_dist_pos = sorted(dist_pos.items(), key=operator.itemgetter(1), reverse=True)
sorted_dist_pos[:50]


[('❤️❤️❤️', 2),
 ('bashak', 2),
 ('beshak', 2),
 ('baishak', 1),
 ('❤️❤️❤️❤️', 1),
 ('👍👍👍👌', 1),
 ('subhan', 1),
 ('allah', 1),
 ('❤️❤️', 1),
 ('یاحضرت', 1),
 ('علی❤️🙌', 1),
 ('beshaq🔥🔥🔥', 1),
 ('👍👍👍💯💯', 1),
 ('💯👌👌👌', 1)]

##### A list of the common words that appeared in the negative comments

In [17]:
# Texts of all comments labelled 'negative', selected once instead of
# re-filtering result_df on every loop iteration
list_neg = result_df.loc[result_df['label'] == 'negative', 'textDisplay'].tolist()

lst_words_neg = []
for line in list_neg:
    # Raw string so the backslashes reach the regex engine unchanged.
    # `\.{3}` splits on a literal "..."; the previous `\...` matched a dot followed
    # by ANY two characters and therefore swallowed the text after every full stop.
    text_neg = re.split(r'\n| |\?|!|:|"|\(|\)|\.{3}|;', line)
    for word in text_neg:
        # Keep words longer than 3 characters that do not start with '@' or '#' and are not 'RT'
        if len(word) > 3 and not word.startswith(('@', '#')) and word != 'RT':
            lst_words_neg.append(porter.stem(word.lower()))

# Stemmed, lower-cased words ordered from most to least frequent
dist_neg = FreqDist(lst_words_neg)
sorted_dist_neg = sorted(dist_neg.items(), key=operator.itemgetter(1), reverse=True)
sorted_dist_neg[:50]

[]