# vectorization

In [1]:
import pandas as pd 
# Load the employee data from a CSV located relative to the notebook's
# working directory, then preview the first three rows.
emp = pd.read_csv('../data/employee.csv')
emp.head(3)

Unnamed: 0,title,dept,salary,race,gender,hire_date,job_date
0,ASSISTANT DIRECTOR (EX LVL),Municipal Courts Department,121862.0,Hispanic,Female,2006-06-12,2012-10-13
1,LIBRARY ASSISTANT,Library,26125.0,Hispanic,Female,2000-07-19,2010-09-18
2,POLICE OFFICER,Houston Police Department-HPD,45279.0,White,Male,2015-02-03,2015-02-03


In [2]:
# Pull the 'salary' column out of the DataFrame as a Series and preview it
# (head() with no argument shows the first five values).
salary = emp['salary']
salary.head()

0    121862.0
1     26125.0
2     45279.0
3     63166.0
4     56347.0
Name: salary, dtype: float64

In [4]:
# Vectorized arithmetic: the scalar is added to every element at once,
# with no explicit loop. The result is a new Series; salary is not modified.
(salary + 100).head()

0    121962.0
1     26225.0
2     45379.0
3     63266.0
4     56447.0
Name: salary, dtype: float64

# Descriptive statistics

In [5]:
# Aggregation: collapses the whole Series to a single number.
salary.mean()

55767.93160127253

In [6]:
# Smallest salary in the Series.
salary.min()

24960.0

In [7]:
# count() returns the number of non-missing values, so it can be smaller
# than len(salary).
salary.count()

1886

In [8]:
# len() counts every row, including rows whose salary is missing.
len(salary)

2000

# Aggregating vs. non-aggregating methods

In [10]:
# Non-aggregating method: abs() is applied element by element and returns a
# Series rather than a single value.
salary.abs().head()

0    121862.0
1     26125.0
2     45279.0
3     63166.0
4     56347.0
Name: salary, dtype: float64

# Boolean Series

In [11]:
# Element-wise comparison: produces a boolean Series that is True wherever
# the salary is above 100,000 (10 ** 5).
s = salary > (10 ** 5)
s.head()

0     True
1    False
2    False
3    False
4    False
Name: salary, dtype: bool

In [13]:
# Boolean selection: passing the boolean Series inside the brackets keeps only
# the rows of emp where s is True; the original index labels are preserved.
emp[s].head()

Unnamed: 0,title,dept,salary,race,gender,hire_date,job_date
0,ASSISTANT DIRECTOR (EX LVL),Municipal Courts Department,121862.0,Hispanic,Female,2006-06-12,2012-10-13
8,DEPUTY ASSISTANT DIRECTOR (EXECUTIVE LEV,Public Works & Engineering-PWE,107962.0,White,Male,1993-11-15,2013-01-05
11,"CHIEF PHYSICIAN,MD",Health & Human Services,180416.0,Black,Male,1987-05-22,1999-08-28
43,ASSOCIATE EMS PHYSICIAN DIRECTOR,Houston Fire Department (HFD),165216.0,Hispanic,Male,2013-08-31,2013-08-31
66,"PUBLIC HEALTH DENTIST,DDS",Health & Human Services,100791.0,White,Female,2015-12-28,2015-12-28


In [14]:
# True counts as 1 and False as 0, so the sum is the number of True values.
s.sum()

57

In [15]:
# The mean of a boolean Series is the fraction of its values that are True.
s.mean()

0.0285

In [16]:
# Total number of rows in the boolean Series (the denominator of s.mean()).
len(s)

2000

In [21]:
race = emp['race']
# NOTE(review): a bare Series as the last expression prints a long, truncated
# listing of the whole column; race.head() would give a shorter preview.
race


0       Hispanic
1       Hispanic
2          White
3          White
4          White
5          Black
6          Asian
7          White
8          White
9            NaN
10      Hispanic
11         Black
12         Black
13         Black
14         Black
15         Black
16      Hispanic
17         White
18      Hispanic
19         White
20         White
21         Black
22         Black
23         White
24         Black
25         White
26      Hispanic
27         White
28         White
29         Black
          ...   
1970    Hispanic
1971    Hispanic
1972       White
1973       White
1974       Black
1975       Asian
1976    Hispanic
1977       White
1978       Black
1979    Hispanic
1980       White
1981       Black
1982       Black
1983       Black
1984       White
1985       Black
1986       White
1987       White
1988       White
1989       Black
1990       Black
1991    Hispanic
1992    Hispanic
1993       White
1994       Black
1995       White
1996       Black
1997       Whi

In [23]:
# Element-wise equality test against a string. A missing race compares as
# False (see row 9 in the preview).
race_black = race == 'Black'
race_black.head(10)

0    False
1    False
2    False
3    False
4    False
5     True
6    False
7    False
8    False
9    False
Name: race, dtype: bool

In [25]:
# Number of rows where race is 'Black' (each True contributes 1).
race_black.sum()

700

In [27]:
# Total number of rows; len() also counts rows whose race is missing.
race_total = len(race_black)

In [28]:
# Share of ALL rows that are 'Black'. Rows with a missing race stay in the
# denominator, so this is slightly lower than the proportion reported by
# race.value_counts(normalize=True), which divides by non-missing rows only.
race_black.sum() / race_total

0.35

# sorting

In [31]:
# sort_values() returns a new Series ordered from smallest to largest; each
# value keeps its original index label.
salary_sorted = salary.sort_values()
salary_sorted.head()

454     24960.0
488     26104.0
1844    26125.0
92      26125.0
1124    26125.0
Name: salary, dtype: float64

In [32]:
# The original Series is still in its original order: sort_values() did not
# sort it in place.
salary.head()

0    121862.0
1     26125.0
2     45279.0
3     63166.0
4     56347.0
Name: salary, dtype: float64

# experimenting - learn a method by exploring it

In [35]:
# duplicated() has to be *called*: without parentheses the cell only shows the
# bound method instead of running it. It flags every value that already
# appeared earlier in the Series; any() then answers "is there at least one
# repeated salary?" with a single boolean.
salary.duplicated().any()

True

# Methods vs Selection

In [None]:
# Square brackets select data, while parentheses call a method. Empty brackets
# are a SyntaxError, so a selector is required -- here a slice of the first
# five values.
salary[:5]

# string columns

In [36]:
# Select the 'race' column; its dtype is object because it holds strings.
race = emp['race']
race.head()

0    Hispanic
1    Hispanic
2       White
3       White
4       White
Name: race, dtype: object

In [37]:
# Tally each distinct value, most frequent first (missing values are left out).
# This is the Series method; DataFrame.value_counts was added later, in pandas 1.1.
race.value_counts()

Black              700
White              665
Hispanic           480
Asian              107
Native American     11
Other                2
Name: race, dtype: int64

In [45]:
# normalize=True reports proportions instead of counts; the proportions are
# taken over the non-missing values.
race.value_counts(normalize=True)

Black              0.356234
White              0.338422
Hispanic           0.244275
Asian              0.054453
Native American    0.005598
Other              0.001018
Name: race, dtype: float64

In [42]:
# Number of non-missing race values.
race.count()

1965

In [43]:
# Number of rows, missing values included; the gap between this and
# race.count() is the number of missing entries.
len(race)

2000

In [49]:
# For a numeric Series, bins=5 groups the values into five intervals spanning
# the salary range and tallies each interval instead of each distinct value.
# normalize=True reports the tallies as proportions.
sal_bins = salary.value_counts(bins=5, normalize=True)
sal_bins

(24709.959, 74968.0]    0.8105
(74968.0, 124976.0]     0.1225
(124976.0, 174984.0]    0.0070
(174984.0, 224992.0]    0.0025
(224992.0, 275000.0]    0.0005
Name: salary, dtype: float64

In [50]:
# Order the result by interval rather than by frequency. For this data the
# two orderings coincide, so the output looks the same as before.
sal_bins.sort_index()

(24709.959, 74968.0]    0.8105
(74968.0, 124976.0]     0.1225
(124976.0, 174984.0]    0.0070
(174984.0, 224992.0]    0.0025
(224992.0, 275000.0]    0.0005
Name: salary, dtype: float64

# The `.str` accessor

In [62]:
# The .str accessor applies a string method to every element of the Series.
race.str.upper().head()

0    HISPANIC
1    HISPANIC
2       WHITE
3       WHITE
4       WHITE
Name: race, dtype: object

In [53]:
# Element-wise suffix test. The result has object dtype rather than bool,
# presumably because missing entries in race stay missing in the result.
race.str.endswith('ite').head()

0    False
1    False
2     True
3     True
4     True
Name: race, dtype: object

In [55]:
# Number of characters in each string. The result is float rather than int,
# presumably because missing entries in race come back as NaN.
race.str.len().head()

0    8.0
1    8.0
2    5.0
3    5.0
4    5.0
Name: race, dtype: float64

In [57]:
# Select the job-title column as a Series and preview it.
emp_title = emp['title']
emp_title.head()

0    ASSISTANT DIRECTOR (EX LVL)
1              LIBRARY ASSISTANT
2                 POLICE OFFICER
3              ENGINEER/OPERATOR
4                    ELECTRICIAN
Name: title, dtype: object

In [59]:
# str.title() capitalizes the first letter of each word and lowercases the rest.
emp_title.str.title().head()

0    Assistant Director (Ex Lvl)
1              Library Assistant
2                 Police Officer
3              Engineer/Operator
4                    Electrician
Name: title, dtype: object

In [61]:
# str.contains() is case-sensitive by default: the previewed titles are all
# uppercase, so a lowercase 'a' matches none of them. Pass case=False for a
# case-insensitive search.
emp_title.str.contains('a').head()

0    False
1    False
2    False
3    False
4    False
Name: title, dtype: bool