# Pandas 1

In [None]:
# Module naming abbreviation


## Learning Objectives:
- Create a pandas Series from a list or from a dict
- Use Series methods max, min, mean, median, mode, quantile, value counts
- Extract elements from a Series using Boolean indexing
- Access Series members using .loc, .iloc, .items, and slicing
- Perform Series element-wise operations

### What is pandas?
- pandas is a package of tools for doing Data Science
- pandas is installed on top of Python (https://en.wikipedia.org/wiki/Pandas_(software))
    - comes with Anaconda installation
    - If, for some reason, you don't have pandas installed, run the following command in a terminal or PowerShell
        <pre> pip install pandas </pre>

## pandas Series
- combination of dict and list
- can be created either from a python `list` or `dict`
- Terminology:
    - index (equivalent to key in python `dict`)
    - integer position (equivalent to index in python `list`)

## Create a series from a dict

In [None]:
# create a series from a dict
d = {"one": 7, "two": 8, "three": 9}
d

In [None]:
type(s)

In [None]:
s = pd.Series({"one": 7, "two": 8, "three": 9}) # equivalent to the above example
s

In [None]:
# IP  index    value
# 0   one      7
# 1   two      8
# 2   three    9

# dtype: int64

## Accessing values with index (.loc[...])

In [None]:
d

In [None]:
# dict access with key
d["one"]

## Accessing values with integer position (.iloc[...])

In [None]:
s

In [None]:
s.iloc[-1]

## Create a series from a list

In [None]:
# Series created from a list
num_list = [100, 200, 300]
s = pd.Series(num_list)
s

In [None]:
# IP  index value
# 0   0      100
# 1   1      200
# 2   2      300
# dtype: int64

In [None]:
print(s.loc[1])
print(s.iloc[1])

## Slicing series using integer positions

In [None]:
letters_list = ["A", "B", "C", "D"]
letters = pd.Series(letters_list)
letters

In [None]:
# list slicing
print(letters_list)
sliced_letter_list = letters_list[2:]
sliced_letter_list

A sliced Series retains the index of the original Series, whereas its integer positions are renumbered starting from 0.

In [None]:
print(letters)
sliced_letters = ???
sliced_letters

In [None]:
# Note: integer positions get renumbered, whereas indexes do not.

# IP  Index  values
# 0   2       C
# 1   3       D
# dtype: object

In [None]:
print(sliced_letters.loc[2])
print(sliced_letters.iloc[0])

## Slicing series using index

In [None]:
s = pd.Series({"one": 7, "two": 8, "three": 9})
s

In [None]:
#slicing with indexes
s.loc["two":]

## Statistics on Series
- Use Series methods max, min, mean, median, mode, quantile, value counts

In [None]:
scores = pd.Series([44, 32, 19, 67, 23, 23, 92, 47, 47, 78, 84])
scores

In [None]:
print(scores.max())
print(scores.idxmax())

In [None]:
print(scores.min())
print(scores.idxmin())

In [None]:
scores.count()

In [None]:
scores.mean()

In [None]:
scores.std()

In [None]:
scores.median()

In [None]:
# there could be multiple modes, so mode returns a Series
scores.mode()

#### Quantile function
- lets you calculate percentiles
- takes as argument a float value between 0 and 1
- defaults to 50th percentile

In [None]:
scores.quantile(), scores.median()

In [None]:
scores.quantile(0.75) # 75th percentile

In [None]:
# five-number summary: minimum, 25th, 50th (median) and 75th percentiles, maximum
print(scores.quantile([0, 0.25, 0.5, 0.75, 1.0]))

#### value_counts()
- Series `value_counts()` creates a new Series whose index holds the distinct values of the original Series and whose values are the number of times each one occurs
- by default, the returned Series is sorted in descending order of the counts (values)

In [None]:
ages = pd.Series([18, 19, 20, 20, 20, 17, 18, 24, 25, 35, 22, 20, 21, 21, 20, 23, 23, 19, 19, 19, 20, 21])
age_counts = ages.value_counts()
age_counts

## Sorting
- sort_index()
- sort_values()

In [None]:
age_counts.sort_index()

In [None]:
age_counts.sort_values()

## Series bar chart

In [None]:
age_plot = age_counts.sort_index().plot.bar(color = 'blue')
age_plot.set(xlabel = "Age", ylabel = "Count")

## Element-wise operations
- Series op scalar

In [None]:
# Let's add 1 to everyone's age
print(ages.value_counts())


print(ages.value_counts())

## Boolean indexing

- applying boolean expressions on a Series
- the boolean expression is written inside the pair of [  ]
- Boolean operators:
    - & means 'and'
    - | means 'or'
    - ~ means 'not'
    - we must use () for compound boolean expressions

In [None]:
# Extract only ages > 21

b =  # gives you a boolean Series
b

In [None]:
# You can apply boolean Series to the original Series
 # now you get ages which are greater than 21

In [None]:
# combining the above two steps


### &, |, and ~
- & means 'and'
- | means 'or'
- ~ means 'not'
- we must use () for compound boolean expressions

In [None]:
# ages boolean
# what ages are in the range 18 to 20, inclusive?

print()

# what percentage of students are in this age range?

print()

In [None]:
# what percentage of students are ages 18 OR 21?

# what percentage of students are NOT 19? 


## CS220 information survey data

In [None]:
# Modified from https://automatetheboringstuff.com/chapter14/
import csv
def process_csv(filename):
    """Read a CSV file and return its rows as a list of lists of strings.

    Args:
        filename: Path of the CSV file to read; it is decoded as UTF-8.

    Returns:
        A list with one entry per CSV row, in file order (a header row, if
        the file has one, is simply the first entry). Each entry is the list
        of that row's fields as strings. An empty file yields an empty list.
    """
    # `with` closes the file even if reading or parsing raises an exception.
    # newline="" leaves newline handling to the csv module, so line breaks
    # embedded in quoted fields are interpreted correctly.
    with open(filename, encoding="utf-8", newline="") as example_file:
        return list(csv.reader(example_file))

data = process_csv("cs220_survey_data.csv")
header = data[0]
data = data[1:]

In [None]:
header

In [None]:
data[:3]

In [None]:
# use list comprehension to extract just ages
age_list = 
print(len(age_list))
# use list comprehension to eliminate the large age
age_list = 
print(len(age_list))
# age_list

In [None]:
cs220_ages = pd.Series(age_list)
cs220_ages

In [None]:
# Make a Series of the counts of all the ages, sorted from most common to least 
# then sort it by index


In [None]:
# make a bar chart of the ages sorted by age
age_plot = cs220_ages.value_counts().sort_index().plot.bar(color='blue')
age_plot.set(xlabel = "age", ylabel = "count")

### Statistics

### What is the mode of CS220 student ages?

### What is the 75th percentile of ages?

## Element-wise operations
1. SERIES op SCALAR
2. SERIES op SERIES

In [None]:
## Series from a dict
game1_points = pd.Series({"Chris": 10, "Kiara": 3, "Mikayla": 7, "Ann": 8, "Trish": 6})
print(game1_points)
game2_points = pd.Series({"Kiara": 7, "Chris": 3,  "Trish": 11, "Mikayla": 2, "Ann": 5, "Meena": 20})
print(game2_points)

### Give 2 additional points for every player's game 1 score

In [None]:
game1_points + 2

In [None]:
game1_points = game1_points + 2
game1_points

### Give 3 additional points for every player's game 2 score

In [None]:
game2_points += 3
game2_points

### Compute total of two series

In [None]:
# Pandas can perform operations on two series by matching up their indices
total = game1_points + game2_points
total

### Who has the highest points?

In [None]:
## Who has the most points?
print(total.max())
print(total.idxmax())

In [None]:
# 'Kiara' is an index label, so plain [ ] (or .loc) finds it. The third entry
# is requested by integer position, which must go through .iloc: on a Series
# with string labels, a bare integer inside [ ] is deprecated as a positional
# lookup and is treated as a label by newer pandas versions.
print(total['Kiara'], total.iloc[2])

In [None]:
s = pd.Series([10, 2, 3, 15])
s

### Find all values > 8

In [None]:
# gives a boolean Series, where each value is True if the corresponding value of the original Series satisfies the condition
b = s > 8
b

In [None]:
# now let's use the boolean Series b as a mask: only the values of s where b is True are kept
s[b]

In [None]:
# Equivalently, you can directly specify boolean expression inside the [ ]
s[s > 8]

In [None]:
# Decomposing the steps here
# Above example is equivalent to
b = pd.Series([True, False, False, True])
s[b]

### How many students are 25 years or older?

### How many students are in the age range 18 to 20, inclusive?

### What percentage of students are ages 18 OR 21?