# Introduction to Pandas & NumPy

## Importing the libraries

In [None]:
# Import the pandas and NumPy libraries
import pandas as pd
import numpy as np

## Introduction to NumPy

### Basic NumPy functions

In [None]:
# Create an array with 21 elements uniformly spaced between 0 and 10: first_array

### CODE HERE ###
 
# Display the result; this raises NameError until first_array is assigned in the exercise above
print(first_array)

In [None]:
# Create an array of 21 integers randomly drawn between 0 and 10: rnd_array

### CODE HERE ###
 
# Display the result; this raises NameError until rnd_array is assigned in the exercise above
print(rnd_array)

In [None]:
# Cast first_array to an array of integers: first_updated

### CODE HERE ###
 
# Stack together first_updated and rnd_array: stack_array

### CODE HERE ###

# Display the result; this raises NameError until stack_array is assigned in the exercise above
print(stack_array)

In [None]:
# What is the average value of stack_array?

### CODE HERE ###
 
# What is the row-average of stack_array?

### CODE HERE ###


## Introduction to Pandas

### Inspecting DataFrames

In [None]:
# Load the googleplaystore.csv file into a pandas DataFrame

### CODE HERE ###
# The path is relative to the current working directory; the loaded frame is bound to `df`
df = pd.read_csv('data/googleplaystore.csv')

In [None]:
# Print the columns of your DataFrame

### CODE HERE ###


In [None]:
# Select the first 5 rows of your DataFrame

### CODE HERE ###


In [None]:
# Show some basic statistical details of your DataFrame

### CODE HERE ###


In [None]:
# Show all the unique values in your DataFrame

### CODE HERE ###


### Updating DataFrames

In [None]:
# Create a new DataFrame called 'updated_apps' that doesn't contain the string 'Varies with device' in the 'Size' column

### CODE HERE ###


In [None]:
# Let's create a function to update a single string of the 'Size' column into a number

def update_value_loop(s):
    """Convert a single 'Size' string (e.g. '19M', '201k', '1,000+') to a number.

    The last character is treated as a unit suffix and removed before the
    remaining text is parsed as a float.

    Parameters
    ----------
    s : str
        Raw size string ending with a one-character suffix.

    Returns
    -------
    float
        The parsed value multiplied by 1000 for a 'k' suffix, by 1000000 for
        an 'M' suffix, and returned unscaled for any other trailing character.

    Raises
    ------
    ValueError
        If the text before the last character is not a valid float.
    """
    # '1,000' would not parse as a float because of the thousands separator
    cleaned = s.replace('1,000', '1000')

    # split into the numeric part and the one-character suffix
    magnitude = float(cleaned[:-1])
    suffix = cleaned[-1:]

    # look up the multiplier; unknown suffixes leave the value unscaled
    scale = {'k': 1000, 'M': 1000000}.get(suffix)
    if scale is None:
        return magnitude
    return magnitude * scale

First, let's update all the values in the 'Size' column by iterating over the DataFrame's rows and appending the results to a list.

In [None]:
%%timeit
# Baseline: convert the 'Size' values one row at a time in a plain Python loop
app_sizes = []
 
# iterrows() yields (index, row) pairs; only the row's 'Size' entry is used here
for index, row in updated_apps.iterrows():
    app_sizes.append(update_value_loop(row['Size']))

Now, let's do the same using pandas' apply() function

In [None]:
%%timeit
# axis=1 passes each row to the lambda, so update_value_loop is still called once per row
app_sizes = updated_apps.apply(lambda row: update_value_loop(row['Size']), axis=1)

The apply() method is 5x faster!

In [None]:
# Let's create a similar function that operates on an entire Series at a time

def update_size_values_vectorized(s):
    """Convert a whole Series of 'Size' strings (e.g. '19M', '201k', '1,000+') to floats.

    Vectorized counterpart of update_value_loop: the last character of every
    element is treated as a unit suffix and removed before the remaining text
    is parsed as a float.

    Parameters
    ----------
    s : pandas.Series
        Series of size strings, each ending with a one-character suffix.

    Returns
    -------
    pandas.Series
        Float Series with the same index as ``s``. Values with a 'k' suffix are
        multiplied by 1_000, values with an 'M' suffix by 1_000_000, and all
        other values are left unscaled. The input Series is not modified.

    Raises
    ------
    ValueError
        If the text before the last character of an element is not a valid float.
    """
    # Substring replacement on every element, matching update_value_loop.
    # Series.replace() would only replace cells whose entire value equals the
    # pattern, so e.g. '1,000k' would keep its comma and fail the float cast.
    s = s.str.replace('1,000', '1000', regex=False)

    # split every element into its numeric part and its one-character suffix
    val = s.str[:-1]
    char = s.str[-1:]

    # convert to value
    number = val.astype(float)

    # scale number based on suffix (each mask is computed once)
    is_kilo = char == 'k'
    is_mega = char == 'M'
    number.loc[is_kilo] = number.loc[is_kilo] * 1_000
    number.loc[is_mega] = number.loc[is_mega] * 1_000_000

    return number

It is possible to go even faster by using a vectorized function

In [None]:
%%timeit
# A single call receives the entire 'Size' column instead of one value at a time
app_sizes = update_size_values_vectorized(updated_apps['Size'])

There is a 10x improvement!

In [None]:
# Create a similar vectorized function to update the 'Price' variable of the DataFrame
# Tips: the function should
# (1) replace 'Everyone' with '$0' and '0' with '$0',
# (2) remove the $ symbol,
# (3) convert the strings to floats.

def update_price(s):
    """Convert a whole Series of 'Price' strings (e.g. '$4.99', '0', 'Everyone') to floats.

    Parameters
    ----------
    s : pandas.Series
        Series of price strings.

    Returns
    -------
    pandas.Series
        Float Series with the same index as ``s``. 'Everyone' and '0' become
        0.0; every other value is parsed after removing the '$' symbol. The
        input Series is not modified.

    Raises
    ------
    ValueError
        If a value is still not a valid float once the '$' symbol is removed.
    """
    # Deal with the 'Everyone' issue
    
    ### CODE HERE ###
    # Series.replace matches whole cell values, so only cells that are exactly
    # 'Everyone' or '0' are rewritten
    s = s.replace({'Everyone': '$0', '0': '$0'})

    # get value without the $ symbol
    
    ### CODE HERE ###
    # regex=False because '$' is a regex anchor, not a literal, in pattern mode
    val = s.str.replace('$', '', regex=False)
 
    # convert to float value and return the result
    
    ### CODE HERE ###
    return val.astype(float)


Let's see how fast our function is updating the Price variable

In [None]:
%%timeit
# Time update_price on the entire 'Price' column in a single call
app_prices = update_price(updated_apps['Price'])

### Memory Management

In [None]:
# Let's create a function that reports the memory usage of a DataFrame or Series, so we can compare storing a column as string vs category

def mem_usage(pandas_obj):
    """Return the memory footprint of a DataFrame or Series as a string in kB.

    Parameters
    ----------
    pandas_obj : pandas.DataFrame or pandas.Series
        Object to measure; anything that is not a DataFrame is assumed to be
        a Series.

    Returns
    -------
    str
        The result of ``memory_usage(deep=True)`` (summed over all columns for
        a DataFrame), divided by 1024 and formatted with one decimal place
        followed by ' kB'.
    """
    footprint_bytes = pandas_obj.memory_usage(deep=True)
    if isinstance(pandas_obj, pd.DataFrame):
        # a DataFrame reports one value per column (plus the index): add them up
        footprint_bytes = footprint_bytes.sum()
    footprint_kb = footprint_bytes / 1024  # bytes -> kilobytes
    return f"{footprint_kb:03.1f} kB"

In [None]:
# Let's compare the memory usage when storing the DataFrame's 'Category' column as string vs category.
# The frame loaded from googleplaystore.csv is bound to `df`; no `google_apps` name exists in this notebook.
print(mem_usage(df['Category']))
print(mem_usage(df['Category'].astype('category')))

Converting the column to a categorical dtype gave a ~50x memory reduction with no information loss!

### Pickling

In [None]:
# Apply the updates to the dataset before saving it.
# assign() builds a new DataFrame instead of writing columns into updated_apps in place,
# which avoids SettingWithCopyWarning when updated_apps was obtained by filtering another frame.
# 'Size' goes through the vectorized helper rather than the row-wise apply().
# NOTE: run this cell only once per kernel session; afterwards 'Size' holds floats and
# the string accessor used by update_size_values_vectorized no longer works on it.
updated_apps = updated_apps.assign(
    Size=update_size_values_vectorized(updated_apps['Size']),
    Price=update_price(updated_apps['Price']),
)

In [None]:
# Save the updated_apps dataset to pickle

### CODE HERE ###


In [None]:
# Save the updated_apps dataset to pickle and compress it to bz2

### CODE HERE ###


In [None]:
# Reload the compressed pickle file

### CODE HERE ###
