In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px

In [None]:
# Load the Amazon Prime titles catalogue into the notebook-wide frame `df`.
# NOTE(review): the path looks like a mounted Google Drive folder in Colab -
# confirm the drive is mounted before running this cell.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/tcc/amazon_prime_titles.csv"
)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,The Grand Seduction,Don McKellar,"Brendan Gleeson, Taylor Kitsch, Gordon Pinsent",Canada,"March 30, 2021",2014,,113 min,"Comedy, Drama",A small fishing village must procure a local d...
1,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",India,"March 30, 2021",2018,14+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...
2,s3,Movie,Secrets of Deception,Josh Webber,"Tom Sizemore, Lorenzo Lamas, Robert LaSardo, R...",United States,"March 30, 2021",2017,,74 min,"Action, Drama, Suspense",After a man discovers his wife is cheating on ...
3,s4,Movie,Pink: Staying True,Sonia Anderson,"Interviews with: Pink, Adele, Beyoncé, Britney...",United States,"March 30, 2021",2014,,69 min,Documentary,"Pink breaks the mold once again, bringing her ..."
4,s5,Movie,Monster Maker,Giles Foster,"Harry Dean Stanton, Kieran O'Brien, George Cos...",United Kingdom,"March 30, 2021",1989,,45 min,"Drama, Fantasy",Teenage Matt Banting wants to work with a famo...


In [None]:
# Dimensions of the loaded table as a (rows, columns) tuple.
df.shape

(9668, 12)

In [None]:
# Column labels available in the loaded table.
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [None]:
df.isnull().sum() # number of missing (empty) cells in each column

show_id            0
type               0
title              0
director        2082
cast            1233
country         8996
date_added      9513
release_year       0
rating           337
duration           0
listed_in          0
description        0
dtype: int64

In [None]:
# Replace every missing cell, in every column, with the placeholder string
# 'Not specified'; later cells filter on this exact string.
df=df.fillna('Not specified') # empty cell = Not specified
# Save the filled table to amazon.csv in the current working directory.
df.to_csv('amazon.csv')

In [None]:
df.nunique(axis=0) # number of distinct values in each column

show_id         9668
type               2
title           9668
director        5775
cast            7928
country           87
date_added        85
release_year     100
rating            25
duration         219
listed_in        518
description     9414
dtype: int64

Na saída acima, podemos notar que existem 2 tipos de conteúdo, além de diversos países, nomes de elenco e diretores.

Agora vamos ver os principais (Top) diretores, principais (Top) nomes do elenco.
Isso pode ser feito visualmente usando gráficos de barras.

In [None]:
# Top 5 directors by number of titles.
# The `director` column holds comma-separated names ("A, B"). Each name is
# split onto its own row and stripped of surrounding whitespace; without the
# strip, " B" and "B" would be counted as two different directors.
director_name = (
    df['director']
    .str.split(',')
    .explode()
    .str.strip()
    .to_frame(name='Director')
)

# Number of titles credited to each director.
directors = director_name.groupby('Director').size().reset_index(name='Total Content')
# Drop the missing-value placeholder and any empty names left by stray commas.
directors = directors[~directors['Director'].isin(['Not specified', ''])]
directors = directors.sort_values(by='Total Content', ascending=False)

# Keep the five most frequent, then re-sort ascending so the largest bar is
# drawn at the top of the horizontal bar chart.
directorsTop5 = directors.head(5).sort_values(by='Total Content')
fig1 = px.bar(directorsTop5, x='Total Content', y='Director', title='Top 5 Directors on Amazon')
fig1.show()

In [None]:
# Top 5 actors by number of titles.
# The `cast` column holds comma-separated names ("A, B, C"). Each name is
# split onto its own row and stripped of surrounding whitespace; without the
# strip, an actor listed first ("B") and the same actor listed later (" B")
# would be counted separately, understating their totals.
cast_name = (
    df['cast']
    .str.split(',')
    .explode()
    .str.strip()
    .to_frame(name='Actor')
)

# Number of titles in which each actor appears.
actors = cast_name.groupby('Actor').size().reset_index(name='Total Content')
# Drop the missing-value placeholder and any empty names left by stray commas.
actors = actors[~actors['Actor'].isin(['Not specified', ''])]
actors = actors.sort_values(by='Total Content', ascending=False)

# Keep the five most frequent, then re-sort ascending so the largest bar is
# drawn at the top of the horizontal bar chart.
actorsTop5 = actors.head(5).sort_values(by='Total Content')
fig2 = px.bar(actorsTop5, x='Total Content', y='Actor', title='Top 5 Actors on Amazon')
fig2.show()

Agora vamos analisar a distribuição de várias classificações. Para isso, podemos usar o gráfico de pizza.

In [None]:
# Count how many titles carry each rating label; one row per rating.
p = (
    df.groupby('rating')
    .size()
    .reset_index(name='counts')
)

# Pie chart of the share of titles per rating.
piechart = px.pie(
    data_frame=p,
    names='rating',
    values='counts',
    title='Ratings of different contents on Amazon',
)
piechart.show()

Agora vamos comparar o número de programas de TV e filmes lançados ao longo dos anos, a partir de 2010 (inclusive).

In [None]:
# Keep only the content type and release year, with a display-friendly label.
df1 = df[['type', 'release_year']].rename(columns={"release_year": "Release Year"})

# Count titles per (year, type) pair and keep release years from 2010 onward.
df2 = (
    df1.groupby(['Release Year', 'type'])
    .size()
    .reset_index(name='Total Content')
    .loc[lambda counts: counts['Release Year'] >= 2010]
)

# One line per content type showing the yearly number of titles.
fig3 = px.line(
    df2,
    x="Release Year",
    y="Total Content",
    color='type',
    title='Trend of content produced over the years on Amazon',
)
fig3.show()

Para a análise de sentimento das descrições dos títulos, a biblioteca TextBlob do Python pode ser usada.

Basicamente TextBlob é uma biblioteca para processamento de dados textuais.

In [None]:
from textblob import TextBlob


def _label_sentiment(text):
    """Label `text` by the sign of its TextBlob polarity.

    Returns 'Positive' when polarity > 0, 'Negative' when polarity < 0,
    and 'Neutral' when polarity is exactly 0.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'


# Work on the release year and description only, with a display-friendly label.
dfx = df[['release_year', 'description']].rename(columns={'release_year': 'Release Year'})

# Label each description independently. The earlier row-by-row loop assigned
# through `.loc[[index, 2], ...]`, which also overwrote row 2 on every
# iteration, so that row ended up carrying the last row's label.
dfx = dfx.assign(Sentiment=dfx['description'].map(_label_sentiment))

# Count titles per (year, sentiment) pair and keep release years from 2010 onward.
dfx = dfx.groupby(['Release Year', 'Sentiment']).size().reset_index(name='Total Content')
dfx = dfx[dfx['Release Year'] >= 2010]

# Stacked bars: one bar per year, coloured by sentiment label.
fig4 = px.bar(dfx, x="Release Year", y="Total Content", color="Sentiment", title="Sentiment of content on Amazon")
fig4.show()

No gráfico de barras acima, podemos ver o sentimento (positivo/neutro/negativo) das descrições ao longo dos anos.