# 1. PANDAS BASICS

In [3]:
# Pandas is a data manipulation and analysis library built on top of NumPy.
# Pandas uses a data structure known as a DataFrame (think of it as Microsoft Excel in Python).
# DataFrames let programmers store and manipulate data in a tabular fashion (rows and columns).
# Series vs. DataFrame? A Series can be thought of as a single column of a DataFrame.

In [4]:
import pandas as pd

In [5]:
# First of two plain Python lists: the stock ticker strings used as the Series data.
# NOTE(review): 'APPL' looks like a typo for Apple's ticker 'AAPL' -- confirm before
# changing it, because the recorded outputs in this notebook show 'APPL'.
my_list = [
    'APPL',
    'AMZN',
    'TSLA',
]
my_list

['APPL', 'AMZN', 'TSLA']

In [6]:
# Second list: one human-readable label per ticker ('stock 1' .. 'stock 3').
label = ['stock {}'.format(position) for position in (1, 2, 3)]
label

['stock 1', 'stock 2', 'stock 3']

In [7]:
# Build a one-dimensional pandas Series.
# A Series pairs each value (data) with a label (index): here the tickers in
# my_list are the values and the strings in label are their index labels.
x_series = pd.Series(data=my_list, index=label)

In [8]:
# Display the series: index labels are shown on the left, values on the right,
# followed by the dtype of the values.
x_series

stock 1    APPL
stock 2    AMZN
stock 3    TSLA
dtype: object

In [9]:
# Inspect the Python type of the object to confirm it is a pandas Series
type(x_series)

pandas.core.series.Series

In [10]:
# Let's define a two-dimensional Pandas DataFrame
# Note that you can create a pandas dataframe from a python dictionary:
# each key becomes a column name and each list holds that column's values.
# Client IDs stay as strings (they are identifiers, not quantities), while
# net worth and years are stored as integers so that sums, comparisons and
# sorting work numerically instead of on text.
bc_df = pd.DataFrame({
    'Bank client ID': ['111', '222', '333', '444'],
    'Client name': ['Chanel', 'Steve', 'Mitch', 'Ryan'],
    'Net worth[$]': [3500, 29000, 10000, 2000],
    'Years with bank': [3, 4, 9, 5],
})
bc_df

Unnamed: 0,Bank client ID,Client name,Net worth[$],Years with bank
0,111,Chanel,3500,3
1,222,Steve,29000,4
2,333,Mitch,10000,9
3,444,Ryan,2000,5


In [11]:
# Inspect the Python type of the object to confirm it is a pandas DataFrame
type(bc_df)

pandas.core.frame.DataFrame

In [12]:
# .head(n) returns only the first n rows of the DataFrame (here n = 2);
# useful for a quick look without displaying the whole table
bc_df.head(2)

Unnamed: 0,Bank client ID,Client name,Net worth[$],Years with bank
0,111,Chanel,3500,3
1,222,Steve,29000,4


In [13]:
# .tail(n) returns only the last n rows of the DataFrame (here n = 2)
bc_df.tail(2)

Unnamed: 0,Bank client ID,Client name,Net worth[$],Years with bank
2,333,Mitch,10000,9
3,444,Ryan,2000,5


**MINI CHALLENGE #1:**
- **A portfolio contains a collection of securities such as stocks, bonds and ETFs. Define a dataframe named 'portfolio_df' that holds 3 different stock ticker symbols, number of shares, and price per share (feel free to choose any stocks)**
- **Calculate the total value of the portfolio including all stocks**

In [14]:
# Portfolio table: one row per holding, with the ticker symbol, the number of
# shares held and the price per share (PPS) in dollars.
portfolio_df = pd.DataFrame(
    data={
        'Stock': ['AMZN', 'MSFT', 'MCQG', 'TSLA'],
        'Number of Shares': [2, 5, 1, 8],
        'PPS[$]': [300, 150, 170, 300],
    }
)
portfolio_df

Unnamed: 0,Stock,Number of Shares,PPS[$]
0,AMZN,2,300
1,MSFT,5,150
2,MCQG,1,170
3,TSLA,8,300


In [15]:
# Value of each holding: price per share times number of shares, computed
# element-wise across the two columns (one result per row).
totalstock = portfolio_df['PPS[$]'].mul(portfolio_df['Number of Shares'])
# Adding up the per-holding values gives the value of the whole portfolio.
print(
    "The total value of the portfolio is ${:0,.2f}".format(totalstock.sum())
)

The total value of the portfolio is $3,920.00


# 2. PANDAS WITH CSV AND HTML DATA

In [16]:
# Check the current working directory: the CSV file used in the following cells
# is referenced by a relative path, so it is looked up relative to this folder.

import os 
os.getcwd()


'c:\\Users\\Owner\\Documents\\Projects\\StockPrediction\\Section10-pandas'

In [17]:
import csv

# csv.reader expects an iterable of lines (such as an open file object), not a
# file name: passing the name string would iterate over its characters instead
# of the file's contents. Open the file first (newline='' as recommended by the
# csv module) and materialise the rows before the file is closed.
with open('bank_client_information.csv', newline='') as csv_file:
    bankinfo = list(csv.reader(csv_file))

In [19]:
# Load the CSV file into a DataFrame with pd.read_csv; the path is relative to
# the current working directory.
# NOTE(review): 'Net Worth' is displayed like "$2,629.13", so it is presumably
# read as text -- convert it to a number before doing arithmetic (TODO confirm).
df = pd.read_csv('bank_client_information.csv')
df

Unnamed: 0,First Name,Last Name,Email,Postal Code,Net Worth
0,Joseph,Patton,daafeja@boh.jm,M6U 5U7,"$2,629.13"
1,Noah,Moran,guutodi@bigwoc.kw,K2D 4M9,"$8,626.96"
2,Nina,Keller,azikez@gahew.mr,S1T 4E6,"$9,072.02"


In [None]:
# write to a csv file without an index


In [None]:
# write to a csv file with an index


In [None]:
# Read tabular data using read_html


**MINI CHALLENGE #2:**
- **Write code that uses Pandas to read tabular US retirement data**
- **You can use data from here: https://www.ssa.gov/oact/progdata/nra.html** 

# 3. PANDAS OPERATIONS

In [None]:
# Let's define a dataframe as follows:


In [None]:
# Pick only the rows that satisfy certain criteria 


In [None]:
# Delete a column from a DataFrame


**MINI CHALLENGE #3:**
- **Using "bank_client_df" DataFrame, leverage pandas operations to only select high networth individuals with minimum $5000** 
- **What is the combined networth for all customers with 5000+ networth?**

# 4. PANDAS WITH FUNCTIONS

In [None]:
# Let's define a dataframe as follows:


In [None]:
# Define a function that increases all clients' net worth (stocks) by a fixed 10% (for simplicity's sake) 


In [None]:
# You can apply a function to the DataFrame 


**MINI CHALLENGE #4:**
- **Define a function that doubles stock prices and adds $100**
- **Apply the function to the DataFrame**
- **Calculate the updated total networth of all clients combined**

# 5. SORTING AND ORDERING

In [None]:
# Let's define a dataframe as follows:


In [None]:
# You can sort the values in the dataframe according to number of years with bank


In [None]:
# Note that nothing changed in memory! you have to make sure that inplace is set to True


In [None]:
# Set inplace = True to ensure that change has taken place in memory 


In [None]:
# Note that now the change (ordering) took place 


**MINI CHALLENGE #5:**
- **Sort customers by networth instead of years with bank. Make sure to update values in-memory.**

# 6. CONCATENATING AND MERGING WITH PANDAS

Check this out: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html

In [None]:
# Creating a dataframe from a dictionary
# Let's define a dataframe with a list of bank clients with IDs = 1, 2, 3, 4, 5 



In [None]:
# Let's define another dataframe for a separate list of clients (IDs = 6, 7, 8, 9, 10)


In [None]:
# Let's assume we obtained additional information (Annual Salary) about our bank customers 
# Note that data obtained is for all clients with IDs 1 to 10
 

In [None]:
# Let's concatenate both dataframes #1 and #2
# Note that we now have client IDs from 1 to 10


In [None]:
# Let's merge all data on 'Bank Client ID'


**MINI CHALLENGE #6:**
- **Let's assume that you became a new client to the bank**
- **Define a new DataFrame that contains your information such as client ID (choose 11), first name, last name, and annual salary.**
- **Add this new dataframe to the original dataframe "bank_df_all".** 

# EXCELLENT JOB!

# MINI CHALLENGE SOLUTIONS

**MINI CHALLENGE #1 SOLUTION:**
- **A portfolio contains a collection of securities such as stocks, bonds and ETFs. Define a dataframe named 'portfolio_df' that holds 3 different stock ticker symbols, number of shares, and price per share (feel free to choose any stocks)**
- **Calculate the total value of the portfolio including all stocks**

In [None]:
# Solution portfolio: three holdings, each with a ticker symbol, a price per
# share in dollars and the number of shares held.
portfolio_df = pd.DataFrame(
    data={
        'stock ticker symbols': ['AAPL', 'AMZN', 'T'],
        'price per share [$]': [3500, 200, 40],
        'Number of stocks': [3, 4, 9],
    }
)
portfolio_df

In [None]:
# Dollar value of each holding: price per share times number of shares,
# computed element-wise (one value per row of portfolio_df).
stocks_dollar_value = portfolio_df['price per share [$]'].mul(
    portfolio_df['Number of stocks']
)
print(stocks_dollar_value)
# The sum over all holdings is the total value of the portfolio.
print(
    'Total portfolio value = {}'.format(stocks_dollar_value.sum())
)

**MINI CHALLENGE #2 SOLUTION:**
- **Write code that uses Pandas to read tabular US retirement data**
- **You can use data from here: https://www.ssa.gov/oact/progdata/nra.html** 

In [None]:
# Read tabular data using read_html (requires network access to the SSA page).
# Note: pd.read_html returns a *list* of DataFrames, one per HTML table found
# on the page, so despite its name retirement_age_df is a list; index into it
# (e.g. retirement_age_df[0]) to get an individual table.
retirement_age_df = pd.read_html('https://www.ssa.gov/oact/progdata/nra.html')
retirement_age_df

**MINI CHALLENGE #3 SOLUTION:**
- **Using "bank_client_df" DataFrame, leverage pandas operations to only select high networth individuals with minimum $5000** 
- **What is the combined networth for all customers with 5000+ networth?**

In [None]:
# Keep only the clients whose net worth is at least $5000: build a boolean
# mask from the 'Net worth [$]' column and use it to select the matching rows.
# NOTE(review): bank_client_df is expected to come from an earlier cell of
# section 3, which is empty in this notebook -- define it before running this.
df_high_networth = bank_client_df.loc[bank_client_df['Net worth [$]'].ge(5000)]
df_high_networth

In [None]:
# Combined net worth of the selected (5000+) clients: sum of the filtered column
df_high_networth['Net worth [$]'].sum()

**MINI CHALLENGE #4 SOLUTION:**
- **Define a function that doubles stock prices and adds $100**
- **Apply the function to the DataFrame**
- **Calculate the updated total networth of all clients combined**

In [None]:
def networth_update(balance):
    """Return the updated balance: double the input, then add 100.

    Args:
        balance: the current value; any object supporting ``* 2`` and ``+ 100``
            (for example an int, a float, or a pandas Series).

    Returns:
        ``balance * 2 + 100``.
    """
    doubled = balance * 2
    return doubled + 100

In [None]:
# Apply networth_update to every value of the 'Net worth [$]' column.
# The result is stored in a new variable; bank_client_df itself is not modified.
results = bank_client_df['Net worth [$]'].apply(networth_update)
results

In [None]:
# Updated total net worth of all clients combined
results.sum()

**MINI CHALLENGE #5 SOLUTION:**
- **Sort customers by networth instead of years with bank. Make sure to update values in-memory.**

In [None]:
# Sort the clients by net worth (ascending, the sort_values default).
# inplace = True reorders bank_client_df itself instead of returning a sorted copy,
# as the challenge asks for the change to be made in memory.
bank_client_df.sort_values(by = 'Net worth [$]', inplace = True) 
bank_client_df

**MINI CHALLENGE #6 SOLUTION:**
- **Let's assume that you became a new client to the bank**
- **Define a new DataFrame that contains your information such as client ID (choose 11), first name, last name, and annual salary.**
- **Add this new dataframe to the original dataframe "bank_df_all".** 

In [None]:
# Single-row record for the new client; every value is wrapped in a
# one-element list so that each key becomes a column with exactly one row.
new_client = {
    'Bank Client ID': ['11'],
    'First Name': ['Ry'],
    'Last Name': ['Aly'],
    'Annual Salary [$/year]': [1000],
}
# The column order is taken from the dictionary's own key order.
new_client_df = pd.DataFrame(new_client, columns=list(new_client))
new_client_df

In [None]:
# Append the new client as an extra row below the existing clients
# (axis = 0 stacks the frames vertically).
# ignore_index = True renumbers the rows 0..n-1; without it the appended row
# keeps its own index label 0 from new_client_df, which would presumably
# duplicate a label already present in bank_df_all and make label-based
# lookups such as new_df.loc[0] ambiguous.
new_df = pd.concat([bank_df_all, new_client_df], axis = 0, ignore_index = True)
new_df