# Team 3 - Final Project Data Processing

In [2]:
!pip install pyathena

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.


In [3]:
import boto3
import sagemaker
import pyathena as pa
from pyathena.pandas.cursor import PandasCursor
from datetime import datetime

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# SageMaker session, its default S3 bucket, and the notebook's execution role
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# AWS region taken from the default boto3 session
boto_session = boto3.Session()
region = boto_session.region_name

In [5]:
# S3 location passed to pyathena as the Athena staging directory
s3_staging_dir = 's3://ads508team3/athena-staging/'
# S3 location for the modeling stage
s3_modeling_dir = 's3://ads508team3/modeling/'

In [6]:
# Open an Athena connection (default cursor class) in the session's region
conn = pa.connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region,
)

In [7]:
# Athena database name and the table names referenced by this notebook
table_list = [
    'animelist',
    'anime',
    'synopsis',
    'ratings',
    'watching_status',
]
dbname = 'myanimelist'

### Construct DataFrames

**Bypassing the default pyathena cursor and using a more efficient PandasCursor**

In [8]:
# Second connection whose cursors return query results as pandas DataFrames.
# NOTE(review): the region is hard-coded here, while `conn` uses the session's
# `region` -- confirm the two are meant to differ.
pandas_conn = pa.connect(
    s3_staging_dir=s3_staging_dir,
    region_name='us-east-1',
    cursor_class=PandasCursor,
)
cursor = pandas_conn.cursor()

In [9]:
def _read_table(table_name):
    """Run ``SELECT *`` on one table of ``dbname`` and return the result as a DataFrame."""
    query = 'SELECT * FROM %s.%s' % (dbname, table_name)
    return cursor.execute(query).as_pandas()


# Only three tables are loaded; the queries for 'synopsis' and 'ratings'
# were commented out in the original cell and stay disabled:
#synopsis = _read_table('synopsis')
#ratings = _read_table('ratings')
anime_list = _read_table('animelist')
anime = _read_table('anime')
watching_status = _read_table('watching_status')

# Data Preparation

### Fix anime scores: replace 'Unknown' values with np.nan, then impute the mean

In [10]:
# Show the dtype of every column in the anime table
anime.dtypes

mal_id            Int64
name             object
score            object
genres           object
english_name     object
japanese_name    object
type             object
episodes         object
aired            object
premiered        object
producers        object
licensors        object
studios          object
source           object
duration         object
rating           object
ranked           object
popularity        Int64
members           Int64
favorites         Int64
watching          Int64
completed         Int64
on_hold           Int64
dropped           Int64
plan_to_watch    object
score_10         object
score_9          object
score_8          object
score_7          object
score_6          object
score_5          object
score_4          object
score_3          object
score_2          object
score_1          object
dtype: object

In [12]:
# 'Unknown' is mapped to NaN first so that the column can be cast to float
anime['score'] = (
    anime['score']
    .replace({'Unknown': np.nan})
    .astype('float')
)

In [15]:
# Check the dtype of the score column
anime['score'].dtypes

dtype('float64')

In [16]:
# Count the rows whose score is missing
anime['score'].isna().sum()

5141

In [17]:
# Mean imputation of missing scores.
# We may have to change this imputation method if it impacts the model too much
score_mean = anime['score'].mean()
anime['score'] = anime['score'].fillna(score_mean)

In [18]:
# Display the score column after imputation
anime['score']

0        6.120000
1        6.970000
2        5.960000
3        5.740000
4        6.010000
           ...   
17557    6.509999
17558    6.509999
17559    6.509999
17560    6.509999
17561    6.509999
Name: score, Length: 17562, dtype: float64

### Feature: Days Since Aired
Take the first (start) date from `aired`, convert it to a datetime, and compute the number of days elapsed since that date.

In [19]:
# 'aired' is split on ' to '; only the first piece is kept
aired_parts = anime['aired'].str.split(' to ', expand=True)
anime['days_since_aired'] = aired_parts.iloc[:, 0]

In [20]:
# Trim leading and trailing whitespace from the extracted date text
anime['days_since_aired'] = (
    anime['days_since_aired']
    .str.strip()
)

In [21]:
# Fix date format issues: pad partial dates so they carry a day and a month.
#   '<Mon>, <YYYY>' -> '<Mon> 1, <YYYY>'   (month and year only)
#   '<YYYY>'        -> 'Jan 1, <YYYY>'     (year only)
# The patterns are raw strings so that \w and \d reach the regex engine
# without being parsed as (invalid) string escape sequences.
anime['days_since_aired'] = anime['days_since_aired'].str.replace(
    r'^(\w{3}), (\d{4})', r'\1 1, \2', regex=True
)
anime['days_since_aired'] = anime['days_since_aired'].str.replace(
    r'^(\d{4})', r'Jan 1, \1', regex=True
)
anime['days_since_aired']

0        Jan 11, 2002
1        Jul 25, 1995
2         Apr 7, 1999
3        Dec 21, 1989
4        Aug 21, 1999
             ...     
17557     Apr 4, 2021
17558     Jan 1, 2021
17559     Jul 1, 2021
17560         Unknown
17561     Jul 1, 2021
Name: days_since_aired, Length: 17562, dtype: object

In [22]:
# Mark 'Unknown' start dates as missing
anime['days_since_aired'] = anime['days_since_aired'].replace({'Unknown': np.nan})
anime['days_since_aired']

0        Jan 11, 2002
1        Jul 25, 1995
2         Apr 7, 1999
3        Dec 21, 1989
4        Aug 21, 1999
             ...     
17557     Apr 4, 2021
17558     Jan 1, 2021
17559     Jul 1, 2021
17560             NaN
17561     Jul 1, 2021
Name: days_since_aired, Length: 17562, dtype: object

In [23]:
# Parse text such as 'Jan 11, 2002' into datetimes
anime['days_since_aired'] = pd.to_datetime(
    anime['days_since_aired'],
    format='%b %d, %Y',
)

In [24]:
# Missing start dates are filled with the current timestamp.
# pd.Timestamp.now() is used instead of pd.datetime.now(): the pd.datetime
# alias was deprecated in pandas 1.0 and removed in pandas 2.0.
anime['days_since_aired'] = anime['days_since_aired'].fillna(pd.Timestamp.now())

  """Entry point for launching an IPython kernel.


In [25]:
# Display the column, which holds datetimes at this point
anime['days_since_aired']

0       2002-01-11 00:00:00.000000
1       1995-07-25 00:00:00.000000
2       1999-04-07 00:00:00.000000
3       1989-12-21 00:00:00.000000
4       1999-08-21 00:00:00.000000
                   ...            
17557   2021-04-04 00:00:00.000000
17558   2021-01-01 00:00:00.000000
17559   2021-07-01 00:00:00.000000
17560   2022-03-31 18:19:05.994792
17561   2021-07-01 00:00:00.000000
Name: days_since_aired, Length: 17562, dtype: datetime64[ns]

### Get Time Delta in days

In [26]:
# Whole days elapsed between the stored date and the current time.
# pd.Timestamp.now() is used instead of pd.datetime.now(): the pd.datetime
# alias was deprecated in pandas 1.0 and removed in pandas 2.0.
anime['days_since_aired'] = (pd.Timestamp.now() - anime['days_since_aired']).dt.days

  """Entry point for launching an IPython kernel.


In [27]:
# Display the final feature: integer day counts
anime['days_since_aired']

0         7384
1         9746
2         8394
3        11788
4         8258
         ...  
17557      361
17558      454
17559      273
17560        0
17561      273
Name: days_since_aired, Length: 17562, dtype: int64

# One Hot Encode Genres