In [1]:
import sqlalchemy
import pandas as pd
import numpy as np
from database_credentials import get_database_url

In [2]:
def check_for_missing_values(data):
    '''Report on missing entries in a dataframe

    Always prints one line stating whether any cell is null. When
    at least one cell is null, a second line with the total number
    of null cells is printed as well.

    Args:
        data (dataframe): dataframe to inspect

    Returns:
        None
    '''

    # True marks a cell that is missing
    null_cells = data.isnull()
    any_null = null_cells.any().any()
    print("Contains missing values:", any_null)

    # nothing further to report for a fully populated dataframe
    if not any_null:
        return

    # sum per column first, then across columns
    print("Number of missing values:", null_cells.sum().sum())
    
def check_for_duplicate_ids(data):
    '''Report on repeated Steam IDs in a dataframe

    Always prints one line stating whether any value in the
    'steam_id' column repeats an earlier one. When repeats exist,
    a second line with the number of repeated rows is printed.

    Args:
        data (dataframe): dataframe to inspect; must have a
                          'steam_id' column

    Returns:
        None
    '''

    # True marks every occurrence of an ID after its first one
    repeated_id_mask = data['steam_id'].duplicated()
    has_repeats = repeated_id_mask.any()
    print("Contains duplicate Steam IDs:", has_repeats)

    # nothing further to report when every ID is unique
    if not has_repeats:
        return

    # each True in the mask is one repeated row
    repeat_count = int(repeated_id_mask.sum())
    print("Number of duplicated Steam IDs:", repeat_count)
        
def display_info(data):
    '''Print a short summary of a dataframe

    The summary lists the column names and the number of rows,
    followed by the missing-value report and the duplicate Steam ID
    report, separated by blank lines.

    Args:
        data (dataframe): dataframe to summarise

    Returns:
        None
    '''

    # (template, value) pairs for the two header lines; each template
    # ends in a newline so the printed line is followed by a blank one
    header_lines = (
        ("Features: {0}\n", data.columns.values),
        ("Number of rows: {0}\n", data.shape[0]),
    )
    for template, value in header_lines:
        print(template.format(value))

    check_for_missing_values(data)
    print()
    check_for_duplicate_ids(data)
        
def mask_duplicate_ids(data):
    '''Drop rows whose Steam ID has already appeared in an earlier row

    The first row for each Steam ID is kept and every later row with
    the same ID is dropped. Kept rows retain their original index
    labels, and the input dataframe is not modified.

    Args:
        data (dataframe): dataframe with duplicate Steam IDs; must
                          have a 'steam_id' column

    Returns:
        masked_data (dataframe): an independent copy of data without
                                 duplicate Steam IDs
    '''

    # duplicated() flags every repeat after the first occurrence, so
    # inverting it selects the rows to keep
    first_occurrence_mask = ~data['steam_id'].duplicated()

    # Copy explicitly: a bare boolean-indexed result is still tracked
    # by pandas as derived from `data`, so assigning a column to it
    # later can raise SettingWithCopyWarning.
    masked_data = data[first_occurrence_mask].copy()
    return masked_data

In [3]:
# connect to database
# The URL is supplied by get_database_url (imported from the
# database_credentials module), so it is not written out in this cell.
database_url = get_database_url()
engine = sqlalchemy.create_engine(database_url)
# NOTE(review): this connection is not closed in any of the visible
# cells — confirm it is closed once the data has been loaded.
connection = engine.connect() 

In [4]:
# load data
# Reviews: one row per entry in the `reviews` table. timestamp_updated
# is passed through FROM_UNIXTIME in SQL before it reaches pandas
# (presumably stored as Unix epoch seconds — confirm against the schema).
review_query = '''
    SELECT 
        steam_id,
        user_review,
        FROM_UNIXTIME(timestamp_updated) AS timestamp_updated,
        recommends_game
    FROM reviews;
'''
reviews = pd.read_sql(review_query, connection)

# Users: one row per entry in the `users` table. Play time is divided
# by 60 and rounded up with CEIL in SQL, and exposed as
# total_hours_played; timestamp_last_played goes through FROM_UNIXTIME.
user_query = '''
    SELECT 
        steam_id,
        CEIL(total_minutes_played / 60) AS total_hours_played,
        FROM_UNIXTIME(timestamp_last_played) AS timestamp_last_played
    FROM users;
'''
users = pd.read_sql(user_query, connection)

In [5]:
# Show the first three review rows, then print the column names, row
# count, missing-value report and duplicate Steam ID report.
display(reviews.head(3))
display_info(reviews)

Unnamed: 0,steam_id,user_review,timestamp_updated,recommends_game
0,76561198109804262,The Game is smooth with great graphics. It sho...,2015-01-14 08:40:12,1
1,76561198144109929,AMAZING Best thing ever. Great graphics for a ...,2015-01-14 06:02:29,1
2,76561198074106169,Best f2p game out there,2015-01-14 03:20:10,1


Features: ['steam_id' 'user_review' 'timestamp_updated' 'recommends_game']

Number of rows: 149999

Contains missing values: False

Contains duplicate Steam IDs: True
Number of duplicated Steam IDs: 3


In [6]:
# Show the first three user rows, then print the column names, row
# count, missing-value report and duplicate Steam ID report.
display(users.head(3))
display_info(users)

Unnamed: 0,steam_id,total_hours_played,timestamp_last_played
0,76561198109804262,1007,2019-05-07 04:44:48
1,76561198144109929,157,2020-01-07 19:00:17
2,76561198074106169,238,2019-10-25 14:07:33


Features: ['steam_id' 'total_hours_played' 'timestamp_last_played']

Number of rows: 149999

Contains missing values: False

Contains duplicate Steam IDs: True
Number of duplicated Steam IDs: 3


In [7]:
# Keep only the first row for each steam_id in both tables (see
# mask_duplicate_ids). The names are rebound here, so the unfiltered
# frames are no longer reachable from later cells.
reviews = mask_duplicate_ids(reviews)
users = mask_duplicate_ids(users)