In [1]:
import numpy as np
import pandas as pd

# Data Visualization of an Anime-Based DataFrame

In [39]:
df = pd.read_csv(r'anime.csv')
df.head()  # For first 5 values of dataset, easier to visualize when dealing with such big data

Unnamed: 0,Rank,Title,Score
0,1,Fullmetal Alchemist: BrotherhoodTV (64 eps)Apr...,9.1
1,2,"Steins;GateTV (24 eps)Apr 2011 - Sep 20112,473...",9.07
2,3,Bleach: Sennen Kessen-henTV (13 eps)Oct 2022 -...,9.06
3,4,"Gintama°TV (51 eps)Apr 2015 - Mar 2016605,113 ...",9.06
4,5,Shingeki no Kyojin Season 3 Part 2TV (10 eps)A...,9.05


### Tasks in Feature Extraction Projest (Anime-based data)

- Make a new column for episode count
- Make a new column for time stamp
- Which anime has the highest score
- Gimme top 5 highest scoring anime
- Which anime has highest episode count
- Anime with top 5 episode count
- Which is the longest running anime

In [6]:
df.loc[1]['Title']

'Steins;GateTV (24 eps)Apr 2011 - Sep 20112,473,707 members'

### Here we will learn FEATURE EXTRACTION

#### Extracting episodes count

In [19]:
def extract_episodes(txt):
    a = txt.find('(')
    b = txt.find(')')
    data = txt[a+1:b]
            
    return data


In [51]:
df['Title'].apply(extract_episodes).head()

0    64 eps
1    24 eps
2    13 eps
3    51 eps
4    10 eps
Name: Title, dtype: object

In [40]:
df['Episodes Count'] = df['Title'].apply(extract_episodes)


In [41]:
df['Episodes Count'] = df['Episodes Count'].str.replace(' eps', '')

In [42]:
df['Episodes Count'] = df['Episodes Count'].astype(int)

In [43]:
df.head()

Unnamed: 0,Rank,Title,Score,Episodes Count
0,1,Fullmetal Alchemist: BrotherhoodTV (64 eps)Apr...,9.1,64
1,2,"Steins;GateTV (24 eps)Apr 2011 - Sep 20112,473...",9.07,24
2,3,Bleach: Sennen Kessen-henTV (13 eps)Oct 2022 -...,9.06,13
3,4,"Gintama°TV (51 eps)Apr 2015 - Mar 2016605,113 ...",9.06,51
4,5,Shingeki no Kyojin Season 3 Part 2TV (10 eps)A...,9.05,10


#### Extracting time stamp

In [44]:
def extract_time(txt):
    a = txt.find(')')
    return txt[a + 1 : a + 20]


In [45]:
df['Time Stamp'] = df['Title'].apply(extract_time)

In [46]:
df.head()

Unnamed: 0,Rank,Title,Score,Episodes Count,Time Stamp
0,1,Fullmetal Alchemist: BrotherhoodTV (64 eps)Apr...,9.1,64,Apr 2009 - Jul 2010
1,2,"Steins;GateTV (24 eps)Apr 2011 - Sep 20112,473...",9.07,24,Apr 2011 - Sep 2011
2,3,Bleach: Sennen Kessen-henTV (13 eps)Oct 2022 -...,9.06,13,Oct 2022 - Dec 2022
3,4,"Gintama°TV (51 eps)Apr 2015 - Mar 2016605,113 ...",9.06,51,Apr 2015 - Mar 2016
4,5,Shingeki no Kyojin Season 3 Part 2TV (10 eps)A...,9.05,10,Apr 2019 - Jul 2019


### For extracting number of months from the time stamp

In [48]:
from dateutil.relativedelta import relativedelta
from datetime import datetime

def calculate_total_months(period):
    try:
        start_str, end_str = period.split(' - ')
        start_date = datetime.strptime(start_str, '%b %Y')
        end_date = datetime.strptime(end_str, '%b %Y')
        r = relativedelta(end_date, start_date)
        return r.years * 12 + r.months + 1  # +1 to include the starting month
    except:
        return None



In [49]:
df['Months'] = df['Time Stamp'].apply(calculate_total_months)

In [50]:
df.head()

Unnamed: 0,Rank,Title,Score,Episodes Count,Time Stamp,Months
0,1,Fullmetal Alchemist: BrotherhoodTV (64 eps)Apr...,9.1,64,Apr 2009 - Jul 2010,16
1,2,"Steins;GateTV (24 eps)Apr 2011 - Sep 20112,473...",9.07,24,Apr 2011 - Sep 2011,6
2,3,Bleach: Sennen Kessen-henTV (13 eps)Oct 2022 -...,9.06,13,Oct 2022 - Dec 2022,3
3,4,"Gintama°TV (51 eps)Apr 2015 - Mar 2016605,113 ...",9.06,51,Apr 2015 - Mar 2016,12
4,5,Shingeki no Kyojin Season 3 Part 2TV (10 eps)A...,9.05,10,Apr 2019 - Jul 2019,4


In [53]:
# anime with highest score
highest_score = df['Score'].max()
highest_score

9.1

In [None]:
# Top 5 highest scoring anime
top5 = df['Score'].sort_values(ascending = False).head()
top5

0    9.10
1    9.07
2    9.06
3    9.06
4    9.05
Name: Score, dtype: float64


In [61]:
# anime with highest eisode count
highest_epCount = df['Episodes Count'].max()
highest_epCount

201

In [63]:
# Top 5 highest scoring anime
top5 = df['Episodes Count'].nlargest(5)
top5

15    201
7     148
11    110
42     75
24     74
Name: Episodes Count, dtype: int32

In [67]:
MostRun = df['Months'].max()
df.loc[df['Months'] == MostRun]

Unnamed: 0,Rank,Title,Score,Episodes Count,Time Stamp,Months
11,12,Ginga Eiyuu DensetsuOVA (110 eps)Jan 1988 - Ma...,9.02,110,Jan 1988 - Mar 1997,111
