<a href="https://colab.research.google.com/github/tomascortes/the-office-script-analisis/blob/main/TheOffice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The Office report creator

This is an analysis of the scripts of all the seasons of the famous show 'The Office'.

# Import of libraries and loading data

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
import spacy
import regex as re
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from statistics import mode 

In [None]:
# Google Drive download link for the script CSV.
url = 'https://drive.google.com/uc?id=1MpIPWZuiUspcfI-oYe1n-f3B821SB_Sv'

# Use the first CSV column as the DataFrame index.
raw_df = pd.read_csv(url,  encoding='utf8', index_col=0)
print(len(raw_df))
# The preview goes last: only the final expression of a cell is rendered.
raw_df.head()

59909


In [None]:
from urllib.request import urlopen

import json

# Google Drive download link for a JSON list of stop words.
stop_words_url = 'https://drive.google.com/uc?id=1t8ne3QP9QdFaYPt5NBR25V05ULH20pWw'
# The context manager closes the HTTP response once the payload has been read.
with urlopen(stop_words_url) as response:
  stop_words = json.loads(response.read())
print(stop_words)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'some',

# Fixing errors



## Missclassified line_text
Some rows were misclassified: the speaker and line_text values were switched.

In [None]:
# Candidate rows: a long speaker field combined with a short line_text that
# contains a colon.
suspicious_rows = raw_df[(
          (raw_df['speaker'].str.len() >= 20)
          & (raw_df['line_text'].str.len() < 30)
          & (raw_df['line_text'].str.contains(':'))
      )]
# iterrows() yields (index label, row Series); only the row is needed here.
for _, row in suspicious_rows.iterrows():
    print(row)
    print(row.speaker)
    print('--'*10)

season                                                       9
episode                                                      4
scene                                                        1
line_text                                               Kevin:
speaker      Group: Dunder Mifflin!\nAndy: Andy Bernard pre...
deleted                                                  False
Name: 53564, dtype: object
Group: Dunder Mifflin!
Andy: Andy Bernard presents: Summer Softball Epic Fails! [Kevin swings bat on screen, fart noise follows] Fail. [repeats] Fail
--------------------
season                           9
episode                          4
scene                            1
line_text                   Oscar:
speaker      [repeats]\nAndy: Fail
deleted                      False
Name: 53565, dtype: object
[repeats]
Andy: Fail
--------------------
season                                                       9
episode                                                      4
scene              

These were clearly misclassified. The text in the speaker column includes both the speaker and the line_text.

In [None]:
def remove_missclassified(raw_df):
  """Return `raw_df` without the rows flagged as misclassified.

  A row is dropped when all three conditions hold:
  its speaker has at least 20 characters, its line_text has fewer than
  30 characters, and its line_text contains a colon.
  """
  long_speaker = raw_df['speaker'].str.len() >= 20
  short_line = raw_df['line_text'].str.len() < 30
  line_has_colon = raw_df['line_text'].str.contains(':')
  looks_swapped = long_speaker & short_line & line_has_colon
  return raw_df[~looks_swapped]

test_df = remove_missclassified(raw_df)
# Sanity check: re-apply the same three-part filter to the result;
# no row should match any more.
leftover = test_df[(
    (test_df['speaker'].str.len() >= 20)
    & (test_df['line_text'].str.len() < 30)
    & (test_df['line_text'].str.contains(':'))
)]
print('Check Removed: {}'.format(len(leftover) == 0))

Check Removed: True


Adding the fixed dialogs. Because there are just 3 cases, we will fix them manually (I watched the episode to check the original dialog).

In [None]:
def make_row(speaker:str, line_text:str, season:int=9, episode:int=4,
             scene:int=1, deleted:bool=False) -> dict:
  """Build one dataset row as a dict.

  Args:
    speaker: value for the 'speaker' column.
    line_text: value for the 'line_text' column.
    season, episode, scene: location of the line; the defaults (9, 4, 1)
      are the values the manual fixes in this notebook use.
    deleted: value for the 'deleted' column.

  Returns:
    A dict with the keys season, episode, scene, line_text, speaker and
    deleted, in that order.
  """
  return {
      'season': season,
      'episode': episode,
      'scene': scene,
      'line_text': line_text,
      'speaker': speaker,
      'deleted': deleted,
  }

def add_fixed_rows(raw_df):
  """Return a copy of `raw_df` with the manually transcribed rows appended.

  The rows are built with make_row, so they carry its default
  season/episode/scene values. The result is re-indexed from 0
  (ignore_index=True).
  """
  # NOTE(review): these rows use the speaker 'Andy Bernard' while the recorded
  # outputs elsewhere in the notebook show 'andy'; confirm whether the two
  # labels should be merged.
  new_rows = [
      make_row('Group', 'Dunder Mifflin!'),
      make_row('Andy Bernard',  'Summer Softball Epic Fails! [Kevin swings bat on screen, fart noise follows] Fail. [repeats] Fail'),
      make_row('Andy Bernard',  'Fail'),
      # Double quotes: the text itself contains apostrophes.
      make_row('Andy Bernard', "Hey, I'm Pete, puberty is such a drag, man. And I'm Clark! I like to eat toilet paper. [Clark and Pete wave at camera] We fail! [Video shows memorial of Jerry")
  ]
  # DataFrame.append was removed in pandas 2.0; concat is the replacement.
  df = pd.concat([raw_df, pd.DataFrame(new_rows)], ignore_index=True)
  return df

  
test_df = add_fixed_rows(raw_df)
print(f'\nNew len: {len(test_df)} | Old len: {len(raw_df)}')
# Show every row mentioning 'Dunder Mifflin!' to confirm the new line is there.
mentions_dunder = test_df['line_text'].str.contains('Dunder Mifflin!')
test_df[mentions_dunder]


New len: 59913 | Old len: 59909


Unnamed: 0,season,episode,scene,line_text,speaker,deleted
9399,2,22,50,Excuse me. Big moment. The evening's chip lead...,Bob,False
14950,3,18,54,I graduated from anger management the same way...,Andy,False
26724,5,13,2,[blowing air horn] Attention everyone! Employe...,Dwight,False
34101,6,10,49,"[running out of the building with Oscar, Andy,...",Michael,False
59909,9,4,1,Dunder Mifflin!,Group,False


# General cleaning
Using just the non-deleted scenes (we are interested in the dialogs that were actually in the series).

In [None]:
def delete_deleted_text(raw_df):
  """Keep only rows whose 'deleted' flag is False and drop that column.

  Returns a frame with the columns season, episode, scene, line_text, speaker.
  """
  kept_columns = ['season', 'episode', 'scene', 'line_text', 'speaker']
  aired_rows = raw_df[~raw_df['deleted']]
  return aired_rows[kept_columns]

Deleting the acction in the speakers. Ex:
- Jan [on phone]
- JIM9334 [screen name]
- Hank [the security guard]
- Female church member [to Michael]

We know that "Hank [the security guard]" is the same character as "Hank", so we can delete the bracketed annotation.

In [None]:
def delete_actions_in_speakers(row):
  """Strip bracketed annotations from a row's speaker, e.g. 'jan [on phone]' -> 'jan'.

  Only speakers containing '[' are touched. For those, every shortest span
  that starts with '(' or '[' and ends with ')' or ']' is removed and the
  result is stripped of surrounding whitespace. The row is modified in place
  and returned.
  """
  if '[' in row['speaker']:
    # Raw string: '\(' and '\[' are regex escapes, not Python string escapes.
    row.speaker = re.sub(r'[\(\[].*?[\)\]]', '', row.speaker).strip()
  return row

For the later text analysis it is important to clean the punctuation.

In [None]:
def clean_punctuation(line):
  """Return `line` with '.', ',' and ';' replaced by a space and quote marks removed.

  The quote-like characters removed are ', ", ´ and `.
  """
  # Iterate over the individual characters. Wrapping the string in a list would
  # make the loop look for the whole sequence ".,;" and never match.
  puntuaction = '.,;'
  for simbol in puntuaction:
    line = line.replace(simbol, ' ')

  # NOTE(review): dropping apostrophes turns contractions such as "don't" into
  # "dont", which no longer equals stop-word entries like "don't"; confirm this
  # is the intended trade-off.
  apostrophe = '\'"´`'
  for simbol in apostrophe:
    line = line.replace(simbol, '')
  return line

## Apply all filters

In [None]:
def data_cleaning(raw_df):
  """Run the whole cleaning pipeline on the raw script frame.

  Steps, in order: drop the misclassified rows, append the manually fixed
  rows, keep only non-deleted lines (which also drops the 'deleted' column),
  lower-case the speaker names, and strip bracketed annotations from speakers.
  The input frame is not modified.
  """
  # removing and adding, columns and rows
  cleaned = remove_missclassified(raw_df)
  cleaned = add_fixed_rows(cleaned).reset_index()
  cleaned = delete_deleted_text(cleaned)
  # Lower case the names. assign() builds a new frame instead of writing into
  # a filtered slice, which avoids pandas' SettingWithCopyWarning.
  cleaned = cleaned.assign(speaker=cleaned.speaker.str.lower())

  # filters on each line
  cleaned = cleaned.apply(delete_actions_in_speakers, axis=1)

  return cleaned
data_cleaning(raw_df)

Unnamed: 0,season,episode,scene,line_text,speaker
0,1,1,1,All right Jim. Your quarterlies look very good...,michael
1,1,1,1,"Oh, I told you. I couldn't close it. So...",jim
2,1,1,1,So you've come to the master for guidance? Is ...,michael
3,1,1,1,"Actually, you called me in here, but yeah.",jim
4,1,1,1,"All right. Well, let me show you how it's done.",michael
...,...,...,...,...,...
59905,9,23,116,I thought it was weird when you picked us to m...,pam
59906,9,4,1,Dunder Mifflin!,group
59907,9,4,1,Summer Softball Epic Fails! [Kevin swings bat ...,andy bernard
59908,9,4,1,Fail,andy bernard


# Util general functions

In [None]:
def get_dict_episodes_per_season(df):
  """Return {season: number of distinct episodes in that season}."""
  unique_episodes = df[['season', 'episode']].drop_duplicates(
      ['season', 'episode'], keep='first')
  return unique_episodes.groupby(['season'])['episode'].count().to_dict()

def get_dict_scenes_per_episode(df):
  """Return {(season, episode): number of rows in that episode}.

  Despite the name, this counts rows of `df` per episode, not distinct
  scene numbers: no de-duplication on 'scene' is performed.
  """
  by_episode = df[['season', 'episode', 'scene']].groupby(['season', 'episode'])
  return by_episode['episode'].count().to_dict()

# Questions


## 1 How many characters are there? What are their names?


First we want to separate speakers when two or more speakers say the same line

- Andy and Dwight

In [None]:
def print_bad_formated_speakers(row):
  """Print the row's speaker if it contains '[', '&', ',' or 'and '.

  Each speaker is printed at most once; nothing is returned.
  """
  markers = ('[', '&', ',', 'and ')
  if any(marker in row['speaker'] for marker in markers):
    print(row.speaker)

In [None]:
# Clean the raw script, then print every speaker label that still contains
# one of the markers checked by print_bad_formated_speakers.
df = data_cleaning(raw_df)
# apply() is used only for its printing side effect; its result is discarded.
df.apply(print_bad_formated_speakers, axis=1)
print()

michael and dwight
pam and jim
dwight and michael
dwight and michael
dwight and michael
michael and christian
michael and christian
michael and jim
michael and jim
kevin & oscar
phyllis, meredith, michael, kevin
darryl and katy
jim and pam
pam and others
michael and dwight
jim and dwight
michael and dwight
michael and dwight
dwight and michael
jim, josh, and dwight
ryan and others
andy and jim
andy and jim
andy and jim
andy and jim
michael and dwight
michael and dwight
michael & dwight
michael and dwight
andy & michael
andy & michael
andy and michael
pam and jim
oscar and stanley
andy, creed, kevin, kelly, darryl
andy, creed, kevin, kelly
andy, creed, kevin, kelly
andy, creed, kevin, kelly
andy, creed, kevin, kelly
andy, creed, kevin, kelly
dwight and michael
michael and dwight
michael and dwight
michael and darryl
angela and dwight
michael & holly
michael and holly
michael, holly, and darryl
holly & darryl
holly & michael
michael & holly
michael & holly
michael & holly
jim and pam
kev

In [None]:
def split_jinx(row):
  """ Also referd as 'Jinx, buy me a coke!'

  Split a shared speaker label into the individual speakers.

  The speaker is first stripped of surrounding whitespace. If it contains a
  separator (',', '&' or the standalone word 'and'), row.speaker is replaced
  by the list of names, e.g. 'jim, josh, and dwight' -> ['jim', 'josh', 'dwight'].
  Names made of several words ('david wallace') are kept whole. Speakers
  without a separator stay plain strings. The row is modified in place and
  returned.
  """
  row.speaker = row.speaker.strip()
  # 'and' only counts when it is not glued to other characters, so a label
  # like 'band member' is left alone.
  pieces = re.split(r"[,&]|(?<!\S)and(?!\S)", row.speaker)
  if len(pieces) > 1:
    row.speaker = [piece.strip() for piece in pieces if piece.strip()]

  return row

def separated_jinx_dialogs(raw_df):
  """Clean `raw_df` and give every speaker of a shared line its own row.

  split_jinx turns shared speaker labels into lists and explode() expands
  them into one row per name. reset_index() then keeps the previous index
  as an 'index' column.
  """
  cleaned = data_cleaning(raw_df)
  with_speaker_lists = cleaned.apply(split_jinx, axis=1)
  return with_speaker_lists.explode('speaker').reset_index()

In [None]:
# One row per (line, speaker) after splitting shared lines.
df_speakers_splited = separated_jinx_dialogs(raw_df)
# Number of rows per speaker label, most frequent first.
speakers_count = df_speakers_splited.speaker.value_counts()
# Number of distinct speaker labels.
len(speakers_count)

689

Strictly speaking, there are 689 distinct speakers in the 9 seasons of The Office.

Because we want the characters that have the most impact on the series, I filter them by how many times they spoke.

In [None]:
# For each threshold 0, 20, ..., 240: how many speakers have strictly more
# lines than the threshold, plus the matching chart label.
line_thresholds = range(0, 250, 20)
dialgs = [
  sum(line_total > threshold for line_total in speakers_count)
  for threshold in line_thresholds
]
label = [
  f'Characters that spoke more than {threshold} lines'
  for threshold in line_thresholds
]

In [None]:
# Horizontal bars: one per threshold label, length = number of speakers.
speakers_above_threshold = go.Bar(x=dialgs, y=label, orientation='h')
fig = go.Figure(speakers_above_threshold)

fig.show()


To be considered a character, it is probably best to set a minimum number of lines spoken across the whole 9 seasons.

We can see that the number of characters stabilizes around 30-40. With that cut-off we avoid counting extras as characters, and from now on the following questions are answered for these characters only.

Their names are: 

In [None]:
# The 35 speakers with the most lines (value_counts is sorted descending).
main_characters = speakers_count[:35].index.tolist()

# Four display rows: three rows of ten names and a final row with the rest.
row_bounds = [(None, 10), (10, 20), (20, 30), (30, None)]
names = '\n'.join(
  '  '.join(main_characters[start:stop]) for start, stop in row_bounds
)

print(names)

michael  dwight  jim  pam  andy  kevin  angela  erin  oscar  ryan
darryl  phyllis  kelly  jan  toby  stanley  meredith  holly  nellie  creed
gabe  robert  david  karen  clark  deangelo  roy  charles  pete  jo
david wallace  carol  katy  donna  val


## 2 For each character, find out who has the most lines across all episodes

Using the same dataset as before we can get that solution easily

In [None]:
print('Character with most lines: ',
      df_speakers_splited.speaker.value_counts().idxmax())
# speakers_count is indexed by speaker name, so take the first (largest) count
# by position with .iloc rather than with [0].
print('Amount of lines: ', speakers_count.iloc[0])

Character with most lines:  michael
Amount of lines:  11625


## 3 What is the average of words per line for each character?

For this question we will clean the punctuation and remove the stop words, to check the real words per line. 

In [None]:
def get_words_per_line_features(df, main_characters, stop_words):
  """Summarise the number of non-stop-words per line for each main character.

  Args:
    df: frame with at least 'speaker' and 'line_text' columns.
    main_characters: speakers to keep; all other rows are ignored.
    stop_words: iterable of words excluded from the count.

  Returns:
    A frame with one row per speaker and the columns speaker, max, min,
    count, median, mean and std, sorted by ascending mean.
  """
  # Set lookup is O(1). Lower-casing both sides makes the filter
  # case-insensitive, so 'I' or 'The' are dropped like 'i' and 'the'.
  stop_set = {word.lower() for word in stop_words}
  # Keep just main characters
  df = df[df.speaker.isin(main_characters)]
  # Clean punctuation
  cleaned_lines = df.line_text.apply(clean_punctuation)
  # Get amount of words per line
  amount_of_words_line = cleaned_lines.apply(
    lambda line: sum(word.lower() not in stop_set for word in line.split())
  )
  df = df.assign(amount_of_words_line=amount_of_words_line)
  # Group by speaker and get stats of the amount of words
  df_grouped = df.groupby('speaker') \
    .amount_of_words_line.agg(
        ['max', 'min', 'count', 'median', 'mean', 'std']
      )
  # Sort the values for better visualization
  df_grouped = df_grouped.sort_values(by='mean')
  return df_grouped.reset_index()

In [None]:
# Per-speaker statistics of words per line, sorted by ascending mean, so
# tail() shows the speakers with the highest means.
words_info = get_words_per_line_features(
    df_speakers_splited, 
    main_characters, 
    stop_words)
words_info.tail()

Unnamed: 0,speaker,max,min,count,median,mean,std
30,Robert,68,1,431,5.0,8.092807,8.922703
31,Michael,106,1,11624,5.0,8.176875,8.993016
32,Deangelo,75,1,171,6.0,8.187135,8.859031
33,Jo,45,1,194,6.0,8.427835,8.312388
34,DeAngelo,93,1,79,6.0,10.924051,14.85552


In [None]:
# Two side-by-side horizontal bar charts over the same speakers:
# the mean (left) and the standard deviation (right) of words per line.
fig = make_subplots(rows=1, cols=2)

speaker_labels = words_info.speaker.to_list()
panels = [('Mean', 'mean'), ('Std', 'std')]
for column_number, (trace_name, stat_column) in enumerate(panels, start=1):
  fig.add_trace(
      go.Bar(
        name=trace_name,
        x=words_info[stat_column].to_list(),
        y=speaker_labels,
        orientation='h'),
      row=1, col=column_number
  )

fig.update_layout(height=1000, width=1500, title_text='Mean and standar deviation in words per line for each character')
fig.show()

In [None]:
# Statistics row of the speaker labelled 'deangelo'.
words_info.loc[words_info.speaker == 'deangelo']

Unnamed: 0,speaker,max,min,count,median,mean,std
34,DeAngelo,93,1,79,6.0,10.924051,14.85552


We can see that DeAngelo has the highest mean of words per line, but mainly because he has one line with 93 words.

This was expected, given that he also has the biggest standard deviation and not as many lines as other characters (he has 79).

## 4 What is the most common word per character

In [None]:
from scipy import stats as s

def _most_frequent_word(words):
  """Return the most frequent item of `words`, or None when `words` is empty.

  Ties are resolved by taking the first entry of pandas' sorted mode result.
  """
  if not words:
    return None
  return pd.Series(words).mode().iloc[0]


def get_most_commond_words(df, main_characters, stop_words):
  """Find the most common non-stop-word for each main character.

  Args:
    df: frame with at least 'speaker' and 'line_text' columns.
    main_characters: speakers to keep; all other rows are ignored.
    stop_words: iterable of (lower-case) words to exclude.

  Returns:
    A frame with one row per speaker and the columns speaker,
    list_words_line (all kept words of that speaker) and common_word.
  """
  stop_set = set(stop_words)
  # Keep just main characters
  df = df[df.speaker.isin(main_characters)]
  # Clean punctuation, then lower case
  cleaned_lines = df.line_text.apply(clean_punctuation).str.lower()
  # Tokenise each line and drop stop words
  list_words_line = cleaned_lines.apply(
    lambda line: [word for word in re.findall(r"[\w']+", line) if word not in stop_set]
  )
  df = df.assign(list_words_line=list_words_line)

  # Group by speaker; summing lists concatenates them
  df_grouped = df.groupby('speaker') \
    .agg({'list_words_line': 'sum'})

  # Calculate the mode with pandas, which handles string data
  common_word = df_grouped.list_words_line.apply(_most_frequent_word)
  df_grouped = df_grouped.assign(common_word=common_word)
  return df_grouped.reset_index()

In [None]:
# Most common non-stop-word for each main character.
common_words = get_most_commond_words(df_speakers_splited, main_characters, stop_words)
common_words[['speaker', 'common_word']]

Unnamed: 0,speaker,common_word
0,Andy,yeah
1,Angela,dwight
2,Carol,michael
3,Charles,michael
4,Clark,hey
5,Creed,creed
6,Darryl,yeah
7,David,michael
8,David Wallace,dwight
9,DeAngelo,michael


We have the most common word per character. It is interesting that, among these most common words, the one that appears most often is "michael".

In [None]:
# How many characters share each most-common word.
common_words.common_word.value_counts()

michael    11
yeah        9
dwight      3
hey         3
andy        2
uh          2
good        2
creed       1
jim         1
ryan        1
Name: common_word, dtype: int64

## 5 Number of episodes where the character does not have a line, for each character

In [None]:
# Total number of episodes: sum of the per-season episode counts.
total_amount_of_episodes = sum(get_dict_episodes_per_season(df).values())

We can now count, for each main character, the number of episodes in which they do not have a line.

In [None]:
def appearaence_episodes(df):
  """Count, for each main character, the episodes in which they have a line.

  Reads the module-level `main_characters` list and expects `df` to have an
  'index' column. Returns a frame with the columns speaker and
  episodes_appearance, sorted by ascending episodes_appearance.

  NOTE(review): a later cell defines another function with this same name.
  """
  main_only = df[df.speaker.isin(main_characters)]
  # One row per (season, episode, speaker) combination.
  one_row_per_episode = main_only.drop(['line_text', 'index'], axis=1)
  one_row_per_episode = one_row_per_episode.drop_duplicates(
      ['season', 'episode', 'speaker'], keep='first')

  episode_totals = one_row_per_episode.groupby(['speaker']).episode.count()
  appearances = episode_totals.reset_index(name='episodes_appearance')
  return appearances.sort_values(by='episodes_appearance')

# itertuples() is the supported way to walk rows; each row exposes the columns
# as attributes.
for row in appearaence_episodes(df_speakers_splited).itertuples(index=False):
  # Episodes without a line = all episodes minus episodes with a line.
  not_app = total_amount_of_episodes - row.episodes_appearance
  not_app_per = 100*not_app/total_amount_of_episodes
  not_app_per = round(not_app_per)
  print(f'{row.speaker}:')
  print(f'Not appear in {not_app} episodes ({not_app_per}%)')
  print()

katy:
Not appear in 183 episodes (98%)

donna:
Not appear in 183 episodes (98%)

deangelo:
Not appear in 182 episodes (98%)

carol:
Not appear in 180 episodes (97%)

charles:
Not appear in 179 episodes (96%)

jo:
Not appear in 177 episodes (95%)

val:
Not appear in 175 episodes (94%)

david wallace:
Not appear in 170 episodes (91%)

holly:
Not appear in 169 episodes (91%)

clark:
Not appear in 166 episodes (89%)

robert:
Not appear in 165 episodes (89%)

pete:
Not appear in 164 episodes (88%)

karen:
Not appear in 160 episodes (86%)

david:
Not appear in 158 episodes (85%)

roy:
Not appear in 157 episodes (84%)

nellie:
Not appear in 153 episodes (82%)

jan:
Not appear in 145 episodes (78%)

gabe:
Not appear in 139 episodes (75%)

erin:
Not appear in 88 episodes (47%)

darryl:
Not appear in 81 episodes (44%)

toby:
Not appear in 73 episodes (39%)

michael:
Not appear in 49 episodes (26%)

creed:
Not appear in 48 episodes (26%)

meredith:
Not appear in 46 episodes (25%)

ryan:
Not appea

## 6.1 Number of times "That's what she said" joke comes up

In [None]:
# Case-insensitive match; rows with a missing line_text count as "no match".
# Double quotes are required because the pattern itself contains an apostrophe.
joke_df = df[df['line_text'].str.contains("that's what she", na=False, case=False)]
print(f'Amount of times "that what she said" joke: {len(joke_df)}')

Amount of times 'that what she said' joke: 40


### 6.2 Include five examples of the joke

In [None]:
# First five matches: speaker followed by the full line.
for row in joke_df.head().itertuples(index=False):
  print(row.speaker, ':')
  print(row.line_text)
  print()

Michael :
That's what she said. Pam?

Michael :
That's what she sai [clears throat]  Nope, but... Okay. Well, suit yourself.

Jim :
Does that include 'That's What She Said'?

Michael :
THAT'S WHAT SHE SAID!

Michael :
A, that's what she said, and B, I wanted it to be impressive. The biggest day of the year deserves the biggest tree of the year.



Characters that said the joke

In [None]:
# Number of matching lines per speaker.
joke_df.speaker.value_counts()

Michael                27
Dwight                  3
Jim                     2
Stanley                 1
Jan                     1
Everyone                1
Deposition Reporter     1
Pam                     1
David                   1
Holly                   1
Creed                   1
Name: speaker, dtype: int64

## 7 The average percent of lines each character contributed to each episode per season.



In [None]:
# {(season, episode): number of rows of df in that episode}; used below as
# the denominator of each character's share.
scenes_per_episode_season = get_dict_scenes_per_episode(df)

In [None]:
def percentage_scenes_episode(row):
  """Return row.scenes as a percentage of the episode total.

  The total is looked up in the module-level `scenes_per_episode_season`
  dict under the key (row.season, row.episode).
  """
  episode_key = (row.season, row.episode)
  episode_total = scenes_per_episode_season[episode_key]
  return 100*row.scenes/episode_total

def appearaence_episodes(df):
  """Average share of an episode's lines spoken by each main character.

  For every (speaker, season, episode) present in `df`, the speaker's number
  of lines is turned into a percentage with percentage_scenes_episode. Those
  percentages are then averaged per speaker. Episodes in which a speaker has
  no line do not enter the average.

  Reads the module-level `main_characters` list and expects `df` to have an
  'index' column. Returns a Series indexed by speaker and named
  'precentage_lines_in_scene'.

  NOTE(review): this redefines the function of the same name from the
  "episodes without a line" section; after this cell runs, the earlier
  definition is no longer reachable.
  """
  df = df[df.speaker.isin(main_characters)]
  lines_per_episode = (
      df.drop(['index'], axis=1)
        .groupby(['speaker', 'season', 'episode']).line_text.count()
        .reset_index(name='scenes')
  )

  # There is exactly one row per (speaker, season, episode) here, so no
  # further per-episode aggregation is needed before averaging per speaker.
  perc_scenes = lines_per_episode.apply(percentage_scenes_episode, axis=1)
  lines_per_episode = lines_per_episode.assign(precentage_lines_in_scene=perc_scenes)

  return lines_per_episode.groupby(['speaker']).precentage_lines_in_scene.agg('mean')


The mean percentage for each character across the whole series

In [None]:
# Highest average share first.
perce = appearaence_episodes(df_speakers_splited).sort_values(ascending=False)
for speaker, share in perce.items():
  print(f'{speaker} participation: { round(share) }%')

michael participation: 27%
deangelo participation: 24%
dwight participation: 13%
jim participation: 11%
charles participation: 11%
holly participation: 11%
pam participation: 9%
andy participation: 9%
katy participation: 9%
donna participation: 9%
jo participation: 8%
jan participation: 7%
robert participation: 7%
nellie participation: 6%
erin participation: 5%
clark participation: 4%
david participation: 4%
carol participation: 4%
karen participation: 4%
darryl participation: 4%
gabe participation: 3%
pete participation: 3%
kevin participation: 3%
angela participation: 3%
ryan participation: 3%
oscar participation: 3%
roy participation: 3%
toby participation: 3%
val participation: 2%
kelly participation: 2%
david wallace participation: 2%
phyllis participation: 2%
meredith participation: 1%
stanley participation: 1%
creed participation: 1%


## Q1 Amounts of scenes for each episode along the seasons

First we need the amount of episodes per season (to plot the graph with colors)

In [None]:
# {season: number of distinct episodes}, used to normalise episode numbers.
dic_ep_per_season = get_dict_episodes_per_season(df)

In [None]:
# Number of distinct scenes per (season, episode).
df_scenes = df[['season', 'episode', 'scene']]
df_scenes = df_scenes.drop_duplicates(['season','episode','scene'], keep='first')
df_scenes = df_scenes.groupby(['season', 'episode']).scene.count().reset_index(name='scenes')

# Episode number divided by the number of episodes in its season, computed
# for the whole column at once instead of row by row.
ep_norm = df_scenes.episode / df_scenes.season.map(dic_ep_per_season)

df_scenes = df_scenes.assign(episodes_normalized=ep_norm)

In [None]:
# Preview: scene count and normalised episode position for the first rows.
df_scenes.head(10)

Unnamed: 0,season,episode,scenes,episodes_normalized
0,1,1,39,0.166667
1,1,2,35,0.333333
2,1,3,39,0.5
3,1,4,49,0.666667
4,1,5,40,0.833333
5,1,6,36,1.0
6,2,1,62,0.045455
7,2,2,45,0.090909
8,2,3,50,0.136364
9,2,4,53,0.181818


In [None]:
# One marker per episode: x = season, y = number of scenes, colour = episode
# position within its season (episodes_normalized).
fig = px.scatter(df_scenes, y='scenes', x='season', color='episodes_normalized')
fig.update_traces(marker_size=10)
fig.show()

I expected the last episode of each season to have more scenes. It happened in seasons 3, 4, 7 and 9, which is about half of the seasons, but not as many as I was expecting.

## Q2 Sentiment analisis on lines for the 5 most participative characters


In [None]:
# Install Libraries
!pip install textblob
!pip install langdetect
!pip install -U tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from langdetect import detect
from nltk.stem import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
import nltk
from collections import defaultdict
from tqdm import tqdm

# Fetch the 'vader_lexicon' NLTK resource used for the sentiment scoring below.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [None]:
# The five speakers with the most lines.
small_main_char = main_characters[:5]
# NOTE(review): this filters `df` (shared lines not split per speaker) rather
# than df_speakers_splited; confirm that is intended.
df_sentiment = df[df.speaker.isin(small_main_char)]

In [None]:
#Sentiment Analysis

# Per-speaker counters of lines classified as neutral / negative / positive.
neutral_dict = defaultdict(int)
negative_dict = defaultdict(int)
positive_dict = defaultdict(int)
counter = 0
# Build the analyzer once instead of once per line.
sentiment_analyzer = SentimentIntensityAnalyzer()
for speaker, line_text in zip(df_sentiment.speaker, df_sentiment.line_text):
    score = sentiment_analyzer.polarity_scores(line_text)
    neg = score['neg']
    pos = score['pos']

    # A line is labelled by whichever of the 'neg' / 'pos' scores is larger;
    # equal scores count as neutral.
    if neg > pos:
        negative_dict[speaker] += 1
    elif pos > neg:
        positive_dict[speaker] += 1
    else:
        neutral_dict[speaker] += 1
    counter += 1


In [None]:
import plotly.express as px

# Long-format table: one row per (speaker, sentiment) with the line count,
# in the order positive, negative, neutral for each speaker.
sentiment_counters = (
  ("positive", positive_dict),
  ("negative", negative_dict),
  ("neutral", neutral_dict),
)
results = pd.DataFrame([
  {"speaker": char, "sentiment": sentiment, "amount": counts[char]}
  for char in small_main_char
  for sentiment, counts in sentiment_counters
])

# Horizontal bar chart with one coloured segment per sentiment.
fig = px.bar(results, x="amount", y="speaker", color="sentiment", title="Sentiment analisis per cgaracter", orientation="h")
fig.show()

In [None]:
# NOTE(review): displays one of Plotly's bundled sample datasets; it does not
# use the script data and looks like a leftover scratch cell. Confirm whether
# it can be removed.
px.data.medals_long()


Unnamed: 0,nation,medal,count
0,South Korea,gold,24
1,China,gold,10
2,Canada,gold,9
3,South Korea,silver,13
4,China,silver,15
5,Canada,silver,12
6,South Korea,bronze,11
7,China,bronze,8
8,Canada,bronze,12
