In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [20]:
df = pd.read_csv('data/df_final_full.csv')

### Qual a quantidade total de livros da base?

In [21]:
df['book_id'].nunique()

20216

### Qual a quantidade de livros que possuí apenas 1 autor?

In [22]:
books = df.groupby(['book_id'])

result = books['author_name'].nunique() == 1
result.value_counts()

# Contagem de "True" sao os livros que possuem apenas 1 autor

True     14753
False     5463
Name: author_name, dtype: int64

### Quais os 5 autores com a maior quantidade de livros?

In [23]:
df.groupby('author_name')['book_id'].nunique().sort_values(ascending = False).head(5)
## A coluna book_id se repete, por isso o uso da função nunique(), que conta valores distintos da mesma.

author_name
Weezag                 258
Rand McNally           212
William Shakespeare    117
Matthew Preston         83
Kevin Lee               83
Name: book_id, dtype: int64

In [24]:
len(df.loc[df['author_name'] == 'Weezag'].groupby('book_id').nunique()) ## conferindo

258

### Qual a quantidade de livros por categoria?

In [25]:
category = df.groupby(['category_name'])
category['book_id'].nunique().to_csv('qtd_livros_categoria.csv')
category['book_id'].nunique()

category_name
20th Century & Contemporary Classical Music     2
20th Century History: C 1900  To C 2000        92
21st Century History: From C 2000 -            19
ABC                                             2
Abnormal Psychology                            17
                                               ..
Zoology & Animal Sciences                      50
Zoology: Invertebrates                          3
Zoology: Mammals                                8
Zoos & Wildlife Parks                           1
Zoroastrianism                                  3
Name: book_id, Length: 1731, dtype: int64

### Quais as 5 categorias com a maior quantidade de livros?

In [26]:
category = df.groupby(['category_name'])
category['book_id'].nunique().sort_values(ascending = False).head(5)

category_name
Television                      2225
Teaching Skills & Techniques    2061
World Atlases / World Maps      1896
Usage & Grammar Guides          1862
Theatre Studies                 1854
Name: book_id, dtype: int64

### Qual o formato com a maior quantidade de livros?

In [12]:
formats = df.groupby(['format_name'])
formats['book_id'].nunique().sort_values(ascending = False).head(5)

format_name
Paperback    13497
Hardback      4955
Book           463
Calendar       303
CD             227
Name: book_id, dtype: int64

### Considerando a coluna “bestsellers-rank”, quais os 10 livros mais bem posicionados?

In [13]:
df.groupby(['title', 'bestsellers-rank']).count().sort_values(by = 'bestsellers-rank', ascending = False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,book_id,category_name,author_name,format_name,rating-avg,publication-date
title,bestsellers-rank,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
After the Break : Television Theory Today,3586486.0,10,10,10,10,10,10,10
On Latin Adverbs,3579654.0,5,5,5,5,5,5,5
Feeling Asian Modernities - Transnational Consumption of Japanese TV Dramas,3569736.0,6,6,6,6,6,6,6
"Lexikon der alteren germanischen Lehnwoerter in den ostseefinnischen Sprachen, Band III : P-AE",3567995.0,12,12,12,12,12,12,12
The Real (Arab) World: v. 2,3567310.0,5,5,5,5,5,5,5
Multi-verb Constructions : A View from the Americas,3565119.0,12,12,12,12,12,12,12
The Noun Phrase in Ancient Greek : A Functional Analysis of the Order and Articulation of NP Constituents in Herodotus,3564946.0,2,2,2,2,2,2,2
Ghost Journal,3555958.0,4,4,4,4,4,4,4
Handbook of Generative Approaches to Language Acquisition,3546984.0,10,10,10,10,10,10,10
Ghost Stories of Arizona and New Mexico,3546676.0,3,3,3,3,3,3,3


### Considerando a coluna “rating-avg”, quais os 10 livros mais bem posicionados?

In [14]:
df.groupby(['title', 'rating-avg']).count().sort_values(by = 'rating-avg', ascending = False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,book_id,category_name,author_name,format_name,bestsellers-rank,publication-date
title,rating-avg,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
The Early Shows : A Reference Guide to Network and Syndicated PrimeTime Television Series from 1944 to 1949,5.0,3,3,3,3,3,3,3
The Learning and Teaching of Reading and Writing,5.0,18,18,18,18,18,18,18
Ask the Psychic Medium,5.0,5,5,5,5,5,5,5
Collective Consciousness : How to Transcend Mass Consciousness and Become One With the Universe,5.0,4,4,4,4,4,4,4
Self-Regulation in Education,5.0,13,13,13,13,13,13,13
Improvised Explosives & Munitions,5.0,1,1,1,1,1,1,1
"Selected Writings : On Self-Organization, Philosophy, Bioethics, and Judaism",5.0,15,15,15,15,15,15,15
"Asia Classic, Tubed : Wall Maps Continents",5.0,6,6,6,6,6,6,6
Practicing Music by Design : Historic Virtuosi on Peak Performance,5.0,9,9,9,9,9,9,9
"A Reading Book of the Turkish Language, with a Grammar and Vocabulary",5.0,2,2,2,2,2,2,2


### Quantos livros possuem “rating-avg” maior do que 3,5?

In [15]:
len(df['book_id'].loc[df['rating-avg']  > 3.5].unique())

8182

### Quantos livros tem data de publicação (publication-date) maior do que 01-01-2020?

In [16]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,book_id,title,category_name,author_name,format_name,bestsellers-rank,rating-avg,publication-date
0,0,0,We,Contemporary Fiction,Yevgeny Zamyatin,Paperback,2038.0,3.91,1993-11-25 00:00:00
1,1,0,We,Contemporary Fiction,Clarence Brown,Paperback,2038.0,3.91,1993-11-25 00:00:00
2,2,365,What Haunts Us,Contemporary Fiction,Loren Niemi,Paperback,0.0,4.5,2019-02-12 00:00:00
3,3,462,Alpine Skiing,Contemporary Fiction,John Yacenda,Paperback,0.0,0.0,1992-02-01 00:00:00
4,4,590,Snowboarding,Contemporary Fiction,Anna Marie Bruechert,Paperback,0.0,3.0,1994-12-01 00:00:00
5,5,590,Snowboarding,Contemporary Fiction,Rob Reichenfeld,Paperback,0.0,3.0,1994-12-01 00:00:00
6,6,615,Doctor Who: Plague City,Contemporary Fiction,Jonathan Morris,Paperback,0.0,3.89,2020-12-17 00:00:00
7,7,865,Una Nueva Entrega : Charla Sencilla Para Tiemp...,Contemporary Fiction,Lee Carroll,Paperback,0.0,4.5,2010-04-19 00:00:00
8,8,947,El Sanador,Contemporary Fiction,John Diamond,Paperback,0.0,0.0,2003-07-01 00:00:00
9,9,965,Precious Little,Contemporary Fiction,Madeleine George,Paperback,1762612.0,3.83,2013-02-06 00:00:00


In [17]:
df['publication-date'] = pd.to_datetime(df['publication-date'], errors='coerce')

In [18]:
df['book_id'].loc[df['publication-date']  > '01-01-2020'].nunique()

1172