In [28]:
# Imports 
import pandas as pd
import helper as hp
import numpy as np
from sklearn import preprocessing
from scipy import stats

In [5]:
# Read the level-1 cleaned reviews file into the working frame.
clean_csv = '../data/clean_lvl1.csv'
df = pd.read_csv(clean_csv)

# Column names, non-null counts and dtypes as loaded (before any conversion).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1999 entries, 0 to 1998
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   rating         1999 non-null   float64
 1   date           1999 non-null   object 
 2   comment_count  1999 non-null   int64  
 3   name           1999 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 62.6+ KB


### Columns 
|Column Name | Data Type | 
|------------|-----------|
|rating | float|
|date | date|
|comment_count| int|
|name|string|

### Name 

In [6]:
# Name column: run every value through the project helper hp.lower
# (presumably lower-cases the text — confirm in helper.py).
lowered_names = df['name'].apply(hp.lower)
df['name'] = lowered_names

In [7]:
# Name column: delete the literal text 'review' from every name.
# regex=False pins literal matching explicitly; the default of `regex` is
# True before pandas 2.0 and False from 2.0 on, so leaving it implicit makes
# the call version-dependent.
# NOTE(review): this is a plain substring replacement, so 'review' is also
# removed when it occurs inside a longer word, and any whitespace that
# surrounded it is left in place — confirm that is acceptable downstream.
df['name'] = df['name'].str.replace('review', '', regex=False)

In [8]:
# Store names with pandas' dedicated 'string' dtype instead of generic object.
df = df.astype({'name': 'string'})

### Date 

In [9]:
# Convert the date column from object to datetime64[ns].
df = df.astype({'date': 'datetime64[ns]'})
# Spot-check one element (row label 0): it should now be a pandas Timestamp.
type(df.loc[0, 'date'])

pandas._libs.tslibs.timestamps.Timestamp

In [10]:
# Re-inspect the frame after the conversions above: `date` should now report
# datetime64[ns] and `name` should report string.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1999 entries, 0 to 1998
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   rating         1999 non-null   float64       
 1   date           1999 non-null   datetime64[ns]
 2   comment_count  1999 non-null   int64         
 3   name           1999 non-null   string        
dtypes: datetime64[ns](1), float64(1), int64(1), string(1)
memory usage: 62.6 KB


### Make new columns 

In [11]:
# Separate the date into month and year columns.
date_parts = df['date'].dt
df['month'], df['year'] = date_parts.month, date_parts.year

In [12]:
# Disabled exploratory plot: the code is wrapped in a bare string so it is not
# executed (the cell only echoes the string). Re-enabling it requires `plt`
# (matplotlib.pyplot), which the imports cell of this notebook does not import.
'''
plt.style.use('_classic_test_patch')
_ = pd.plotting.scatter_matrix(df[['rating', 'comment_count', 'year']], figsize = (8, 4), diagonal = 'hist')
'''

"\nplt.style.use('_classic_test_patch')\n_ = pd.plotting.scatter_matrix(df[['rating', 'comment_count', 'year']], figsize = (8, 4), diagonal = 'hist')\n"

In [13]:
# Derive a console value for each review by passing its name through the
# project helper hp.console (presumably extracts the platform mentioned in
# the name — confirm in helper.py).
console_values = df['name'].apply(hp.console)
df['console'] = console_values

In [14]:
# Five-number summary (min, 25%, 50%, 75%, max) of rating and comment count.
# The rows are selected by label rather than by position: a positional slice
# starting at 4 begins at '25%' and silently drops 'min', and it would break
# if describe() ever returned its rows in a different layout.
summary_rows = ['min', '25%', '50%', '75%', 'max']
qrate = df['rating'].describe().loc[summary_rows]
qccount = df['comment_count'].describe().loc[summary_rows]

### Quantiles

|column|min|25%|50%|75%|max|
|------|---|---|---|---|---|
|rating|2|6.8|7.8|8.5|10|
|comment count|0|36|184|670|117881|

In [15]:
# Bin ratings.
# 6.8 and 8.5 are the 25% and 75% rating values listed in the Quantiles table;
# the meaning of the third level (8.6) is defined by hp.set_bin_level — TODO confirm.
rating_levels = (6.8, 8.5, 8.6)
# Presumably hp.bin_ reads the levels configured here, so this call has to
# run before the apply below — verify against helper.py.
hp.set_bin_level(*rating_levels)
df['rate_rank'] = df['rating'].apply(hp.bin_)

In [16]:
# Bin comment counts.
# 36 and 670 are the 25% and 75% comment-count values listed in the Quantiles
# table; the meaning of the third level (671) is defined by hp.set_bin_level —
# TODO confirm.
comment_levels = (36, 670, 671)
# Presumably hp.bin_ reads the levels configured here, so this call has to
# run before the apply below — verify against helper.py.
hp.set_bin_level(*comment_levels)
df['comment_rank'] = df['comment_count'].apply(hp.bin_)

In [35]:
# Write the enriched frame to disk; index=False keeps the row index out of the file.
output_csv = '../data/ign_reviews.csv'
df.to_csv(output_csv, index=False)