# Sentiment Analysis of Social Media Content
## Data Science BootCamp Project


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
# Load the raw sentiment dataset from the project's Data directory.
data = '../Data/sentimentdataset.csv'
df = pd.read_csv(data)
# Drop the two 'Unnamed' columns -- presumably index columns left over from
# earlier CSV exports (TODO confirm); they are not used in the analysis.
df = df.drop(columns=['Unnamed: 0','Unnamed: 0.1'])
# Fixed seed so the preview shows the same rows on every Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
93,Bitter experience at the customer service dep...,Bitter,2023-02-26 15:00:00,CustomerWoes,Facebook,#BitterExperience #CustomerService,18.0,35.0,USA,2023,2,26,15
727,Collaborating on a science project that receiv...,Happy,2017-08-18 18:20:00,ScienceProjectSuccessHighSchool,Facebook,#ScienceFairWinner #HighSchoolScience,20.0,39.0,UK,2017,8,18,18
648,Accidentally spilled paint in art class. Abstr...,Embarrassed,2023-08-07 19:45:00,PaintSpillHighSchool,Instagram,#ArtisticMishaps #HighSchoolArt,22.0,45.0,UK,2023,8,7,19
367,Happiness blooms like flowers in a garden on a...,Happiness,2020-02-10 11:15:00,SunnyDayEnthusiast,Twitter,#Happiness #SunnyDay,15.0,30.0,India,2020,2,10,11
192,"Jealousy poisons my thoughts, resentment brew...",Jealousy,2018-08-05 16:30:00,PoisonedMind,Facebook,#Jealousy #Resentment,8.0,15.0,USA,2018,8,5,16


#### Data Quality Assessment

In [17]:
# Data quality: count the null entries in every column
missing_per_column = df.isna().sum()
print(missing_per_column)

Text         0
Sentiment    0
Timestamp    0
User         0
Platform     0
Hashtags     0
Retweets     0
Likes        0
Country      0
Year         0
Month        0
Day          0
Hour         0
dtype: int64


In [18]:
# How many rows are exact copies of an earlier row?
duplicate_count = df.duplicated().sum()
print(f"Found {duplicate_count} duplicate rows.")

Found 20 duplicate rows.


We see 20 duplicate entries, so we need to remove them.

In [19]:
# Keep the first occurrence of each row and discard its later repeats
df = df.drop_duplicates(keep='first')

In [20]:
# Inspect the distinct labels of each categorical column for inconsistencies
for column in ('Platform', 'Sentiment', 'Country'):
    print(df[column].unique())

[' Twitter  ' ' Instagram ' ' Facebook ' ' Twitter ']
[' Positive  ' ' Negative  ' ' Neutral   ' ' Anger        '
 ' Fear         ' ' Sadness      ' ' Disgust      ' ' Happiness    '
 ' Joy          ' ' Love         ' ' Amusement    ' ' Enjoyment    '
 ' Admiration   ' ' Affection    ' ' Awe          ' ' Disappointed '
 ' Surprise     ' ' Acceptance   ' ' Adoration    ' ' Anticipation '
 ' Bitter       ' ' Calmness     ' ' Confusion    ' ' Excitement   '
 ' Kind         ' ' Pride        ' ' Shame        ' ' Confusion '
 ' Excitement ' ' Shame ' ' Elation       ' ' Euphoria      '
 ' Contentment   ' ' Serenity      ' ' Gratitude     ' ' Hope          '
 ' Empowerment   ' ' Compassion    ' ' Tenderness    ' ' Arousal       '
 ' Enthusiasm    ' ' Fulfillment  ' ' Reverence     ' ' Compassion'
 ' Fulfillment   ' ' Reverence ' ' Elation   ' ' Despair         '
 ' Grief           ' ' Loneliness      ' ' Jealousy        '
 ' Resentment      ' ' Frustration     ' ' Boredom         '
 ' Anxiety

There are many duplicate values, appearing because of extra spaces in the values. Next step is to clean these columns.

In [21]:
# Columns whose labels carry stray leading/trailing whitespace
categorical_cols = ['Platform', 'Sentiment', 'Country']

for col in categorical_cols:
    df[col] = df[col].str.strip()

# Rows that differed only by label padding are identical now. The duplicate
# removal earlier in the notebook ran on the unstripped labels and could not
# catch them, so de-duplicate once more on the normalised values.
rows_before = len(df)
df = df.drop_duplicates()
print(f"Removed {rows_before - len(df)} additional duplicate rows after stripping whitespace.")

In [24]:
# Verify the cleaning: number of distinct labels left in each column
for column in ('Platform', 'Sentiment', 'Country'):
    distinct_labels = df[column].unique()
    print(distinct_labels.shape)

(3,)
(191,)
(33,)


In [26]:
# Summary statistics of the numeric columns, as a sanity check on value ranges
numeric_summary = df.describe()
print(numeric_summary)

         Retweets       Likes         Year       Month         Day        Hour
count  712.000000  712.000000   712.000000  712.000000  712.000000  712.000000
mean    21.542135   42.966292  2020.476124    6.108146   15.514045   15.592697
std      7.124840   14.215995     2.828326    3.406099    8.444895    4.062778
min      5.000000   10.000000  2010.000000    1.000000    1.000000    0.000000
25%     17.750000   34.750000  2019.000000    3.000000   10.000000   13.000000
50%     22.000000   43.000000  2021.000000    6.000000   15.000000   16.000000
75%     25.000000   50.000000  2023.000000    9.000000   22.000000   19.000000
max     40.000000   80.000000  2023.000000   12.000000   31.000000   23.000000


In [None]:
# Assessing the text column
# Strip before testing so that posts consisting only of whitespace are
# counted as empty too (a bare length check reports them as non-empty).
empty_posts = int(df['Text'].str.strip().eq('').sum())
print(f"Found {empty_posts} posts with no text.")

Found 0 posts with no text.


In [28]:
# Report the (rows, columns) dimensions of the dataset after cleaning
final_shape = df.shape
print(f"Final dataset shape: {final_shape}")

Final dataset shape: (712, 13)


## Exploratory data analysis (EDA):