# Team 3 - Final Project EDA

In [6]:
#!pip install pyathena

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting pyathena
  Downloading PyAthena-2.5.1-py3-none-any.whl (40 kB)
     |████████████████████████████████| 40 kB 720 kB/s             
Installing collected packages: pyathena
Successfully installed pyathena-2.5.1
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m


In [99]:
import boto3
import sagemaker
import pyathena as pa

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [100]:
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

In [101]:
# Athena staging bucket
s3_staging_dir = 's3://ads508team3/athena-staging/'

## Create myanimelist Database

In [102]:
# Create Database
conn = pa.connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [103]:
# Create Database and verify
pd.read_sql('CREATE DATABASE IF NOT EXISTS myanimelist', con=conn)
pd.read_sql('SHOW DATABASES', con=conn)

Unnamed: 0,database_name
0,default
1,myanimelist


## Create Athena Tables and Ingesting Data
 * animelist
 * anime
 * anime_with_synopsis
 * rating_complete
 * watching_status


In [104]:
dbname = 'myanimelist'

In [112]:
animelist_table = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         user_id integer,
         anime_id integer,
         rating integer,
         watching_status integer,
         watched_episodes integer
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')""".format(dbname, 'animelist', 's3://ads508-raw-data/animelist/')
pd.read_sql(animelist_table, con=conn)

In [119]:
anime_table = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         MAL_ID int, Name string, Score int, Genres string, English_name string, Japanese_name string, Type string, 
         Episodes int, Aired string, Premiered string, Producers string, 
         Licensors string, Studios string, Source string, Duration string, Rating string, Ranked int,
         Popularity int, Members int, Favorites int, Watching int, Completed int, 
         On_Hold int, Dropped int, Plan_to_Watch int, 
         Score_10 int, Score_9 int, Score_8 int, Score_7 int, Score_6 int, Score_5 int, Score_4 int, Score_3 int, Score_2 int, Score_1 int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')""".format(dbname, 'anime', 's3://ads508-raw-data/anime/')
pd.read_sql(anime_table, con=conn)

In [121]:
synopsis_table = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         MAL_ID int,
         Name string,
         Score float,
         Genres string,
         sypnopsis string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')""".format(dbname, 'synopsis', 's3://ads508-raw-data/synopsis/')
pd.read_sql(synopsis_table, con=conn)

In [123]:
rating_table = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         user_id int,
         anime_id int,
         rating int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')""".format(dbname, 'ratings', 's3://ads508-raw-data/ratings/')
pd.read_sql(rating_table, con=conn)

In [109]:
watchlist_table = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         status int,
         description string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')""".format(dbname, 'watching_status', 's3://ads508-raw-data/watching_status/')
pd.read_sql(watchlist_table, con=conn)

In [125]:
pd.read_sql('SELECT * FROM %s.%s LIMIT 20' % (dbname,'anime'), con=conn)

Unnamed: 0,mal_id,name,score,genres,english_name,japanese_name,type,episodes,aired,premiered,...,score_10,score_9,score_8,score_7,score_6,score_5,score_4,score_3,score_2,score_1
0,1,Cowboy Bebop,8,"""Action",Adventure,Comedy,Drama,,"Space""",Cowboy Bebop,...,39.0,1251960.0,61971,105808,718161,71513,26678,329800,229170,182126
1,5,Cowboy Bebop: Tengoku no Tobira,8,"""Action",Drama,Mystery,Sci-Fi,,Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,...,1174.0,4143.0,208333,1935,770,57964,30043,49201,49505,22632
2,6,Trigun,8,"""Action",Sci-Fi,Adventure,Comedy,,"Shounen""",Trigun,...,201.0,558913.0,12944,29113,343492,25465,13925,146918,50229,75651
3,7,Witch Hunter Robin,7,"""Action",Mystery,Police,Supernatural,,"Magic""",Witch Hunter Robin,...,,,2481,1467,94683,587,4300,46165,5121,5378
4,8,Bouken Ou Beet,6,"""Adventure",Fantasy,Shounen,"Supernatural""",,冒険王ビィト,TV,...,18.0,642.0,7314,766,1108,3394,312,529,1242,1713
5,15,Eyeshield 21,7,"""Action",Sports,Comedy,"Shounen""",,アイシールド21,TV,...,604.0,1003.0,148259,2066,13907,78349,14228,11573,30202,9226
6,16,Hachimitsu to Clover,8,"""Comedy",Drama,Josei,Romance,,Honey and Clover,ハチミツとクローバー,...,468.0,687.0,214499,4101,11909,81145,11901,11026,98518,11829
7,17,Hungry Heart: Wild Striker,7,"""Slice of Life",Comedy,Sports,"Shounen""",,ハングリーハート Wild Striker,TV,...,817.0,13778.0,828,1168,3879,1123,1777,3102,3075,1286
8,18,Initial D Fourth Stage,8,"""Action",Cars,Sports,Drama,,Unknown,頭文字〈イニシャル〉D FOURTH STAGE,...,117929.0,979.0,6082,90967,3053,1356,16471,10948,15820,22379
9,19,Monster,8,"""Drama",Horror,Mystery,Police,,Seinen,"Thriller""",...,,30.0,169,614100,29436,64648,214491,47488,23008,264465
