In [8]:
import numpy as np

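#fundamental data-type in np is the N-dimensional array (ndarray); axes are numbered starting from 0
#axis=0 -- collapses the rows, giving one result per column
#axis=1 -- collapses the columns, giving one result per row
#(reductions such as .mean() default to axis=None, i.e. they use every element of the array)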

In [9]:
A = np.arange(0, 16) #same idea as range(0, 16), but the result is a 1D numpy array

In [10]:
A.reshape(4,4) #returns the 16 elements laid out as a 4 x 4 2D array; A itself is left unchanged

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [11]:
B = A.reshape(4,4) 
#important! .reshape(4,4) returns a 4 x 4 view of the 1D array; it does not modify A in place
#so if we were to call A[1,1], we would still get an error

#we must bind the reshaped array to a name (here B) in order to
#keep working with the 2D version

In [12]:
#A is still a 1D array, so A[1] is a single number and cannot be indexed a second time
#the error is caught and printed so that Restart & Run All does not stop at this cell
try:
    A[1][1]
except IndexError as err:
    print("IndexError:", err)

IndexError: invalid index to scalar variable.

In [13]:
B[1][1]

5

In [14]:
#.mean() called upon a 1D or 2D array calculates the mean 

In [15]:
B.mean()

7.5

In [16]:
B[0].mean()

1.5

In [17]:
#important! 
#numpy gives you what you want in terms of matrix indexing 

#B[0][0] yields the same result as B[0,0]

In [18]:
if B[0,0] == B[0][0]:
    print(True)

True


In [19]:
#Subsetting 
#we can then extend our notion of slicing to the 2D setting 

B[: , :] #this grabs the entire 2D array as is
         #just as A[:] grabs the entire 1D array as is

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [20]:
B[1:3, :] #grab a subset of the rows

array([[ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [21]:
B[:, 1:4] #grab a subset of the columns 

array([[ 1,  2,  3],
       [ 5,  6,  7],
       [ 9, 10, 11],
       [13, 14, 15]])

In [22]:
B[1:3, 1:3] #grab a subset of the rows/columns 
            #essentially allows us to grab minors

array([[ 5,  6],
       [ 9, 10]])

In [23]:
B

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [24]:
#unlike with regular python you can also pass in a list of values into the np slicer 
#ex
B[: , [0,3] ] 

array([[ 0,  3],
       [ 4,  7],
       [ 8, 11],
       [12, 15]])

In [25]:
B[ [0,2] , :]
#this allows you to grab specific rows/columns 
#without having to pull ALL of the rows/columns inbetween those desired 

array([[ 0,  1,  2,  3],
       [ 8,  9, 10, 11]])

In [26]:
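#important! 
#axis = 0 runs down the rows, so B.mean(axis = 0) returns one mean per column
#axis = 1 runs across the columns, so B.mean(axis = 1) returns one mean per row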

In [27]:
B

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [28]:
B.mean(axis = 0) #or B.mean(0) in shorthand 
#computes the column means of the 2D array 

#note an array of means is returned when we specify the axis 

array([6., 7., 8., 9.])

In [29]:
B.mean(axis = 1) 

array([ 1.5,  5.5,  9.5, 13.5])

In [30]:
import pandas as pd

In [31]:
#Pandas connects to 
#CSV
#Excel
#SQL
#JSON
#STATA
#SAS
#HTML

#key functions for reading data are named pd.read_<format>()
#ex/ pd.read_csv( file_path ) -- takes a filepath as input 

In [32]:
#SECTION 1 -- basic indexing 

#lets import the titanic dataset for practice 

In [33]:
import os

#the CSV location can be overridden with the TITANIC_CSV environment variable, so the notebook
#also runs on machines where the original author's absolute path does not exist
#(when the variable is not set, the original path is used unchanged)
TITANIC_CSV = os.environ.get(
    "TITANIC_CSV",
    "/Users/theodoreplotkin/desktop/postmalone/GA_Data_Science/DAT-06-24/class material/Unit 2/data/titanic.csv",
)
df = pd.read_csv(TITANIC_CSV)

In [34]:
df.head() #returns the first 5 rows by default 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [35]:
df.tail() #returns the last 5 rows by default 
          #can take an int as argument specifying how many rows you would like 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [36]:
#just like with a dictionary, 
#you can pull specific columns by referencing them by their labels <strings>
df["PassengerId"].head()

0    1
1    2
2    3
3    4
4    5
Name: PassengerId, dtype: int64

In [37]:
df["Sex"].head()

0      male
1    female
2    female
3    female
4      male
Name: Sex, dtype: object

In [38]:
#you can also pass a list of strings 
df[["Age","Survived", "Fare"]].head()

Unnamed: 0,Age,Survived,Fare
0,22.0,0,7.25
1,38.0,1,71.2833
2,26.0,1,7.925
3,35.0,1,53.1
4,35.0,0,8.05


In [39]:
#also note once you've referenced a specific column (or set of columns),
#you can apply slice notation to grab a specific subset of rows 
df[["Age","Survived", "Fare"]][20:31]

#just think of df[["Age","Survived", "Fare"]] as some list A 

Unnamed: 0,Age,Survived,Fare
20,35.0,0,26.0
21,34.0,1,13.0
22,15.0,1,8.0292
23,28.0,1,35.5
24,8.0,0,21.075
25,38.0,1,31.3875
26,,0,7.225
27,19.0,0,263.0
28,,1,7.8792
29,,0,7.8958


In [40]:
df[["Age", "Fare"]].mean() #DataFrame.mean() is a pandas method; it is applied to each selected column
#for a DataFrame the default argument for mean is axis = 0 (i.e. compute column means) 
#(this makes sense since in most settings computing row means would be nonsensical)
#missing values (NaN) are skipped by default, so the Age mean only uses the known ages

Age     29.699118
Fare    32.204208
dtype: float64

In [41]:
df[["Age", "Fare"]][20:31].mean() #allows us to compute the mean of any subset 

Age     27.125000
Fare    40.792045
dtype: float64

In [42]:
#.loc[] and .iloc[] indexers for selecting and querying data (used with square brackets, not parentheses)

In [43]:
#.loc[] grabs data by its labels 
#.iloc[] grabs data by its integer position (hence the i prefix)
#both are indexers: they are used with square brackets rather than called like methods

#plain df[...] does not accept a (rows, columns) pair of slices, so this lookup fails
try:
    df[1:5,3:5]
except Exception: #unlike a bare except, this lets KeyboardInterrupt / SystemExit through
    print("Note: you must use the .iloc method to grab specific subsets of a dataframe by integer index")

Note: you must use the .iloc method to grab specific subsets of a dataframe by integer index


In [44]:
df.iloc[1:5,[3,4]] #grabs rows 1-4 and columns 3 and 4 
                   #note that 3 <--> "Name" 
                   #          4 <--> "Sex"   
        
#df.iloc[a:b, c:d] or df.iloc[[a,b,e], [c,d,f]] 

#so .iloc[:,:] basically allows us to apply the standard numpy slicer on a pandas dataframe

Unnamed: 0,Name,Sex
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female
2,"Heikkinen, Miss. Laina",female
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female
4,"Allen, Mr. William Henry",male


In [45]:
#.loc works with labels: the columns of this df are labelled with strings,
#so the integer column slice 3:4 is rejected
try:
    df.loc[1:5,3:4]
except Exception: #unlike a bare except, this lets KeyboardInterrupt / SystemExit through
    print("Note: We get errors when we attempt to use integer indexing with the df.loc[:,:] method")

Note: We get errors when we attempt to use integer indexing with the df.loc[:,:] method


In [46]:
df.loc[[1,3,4], ["Sex","Survived"]] #df.loc[:,:] takes row labels first and column labels second
                                    #the ints work here because the row labels of this df are the integers 0, 1, 2, ...

Unnamed: 0,Sex,Survived
1,female,1
3,female,1
4,male,0


In [47]:
#df.loc[:, :] is more flexible than df.iloc[: , :]

#because df.loc[:, :] allows us to pass a list of strings (column names) as the second argument 
#and, since the rows of this df are labelled with integers, we can still pass a range (or list) of integers to reference rows
#(careful: a .loc slice such as 1:5 includes the end label 5, whereas an .iloc slice 1:5 stops at position 4)

#we could also assign string indices to the rows and then pass a list of strings as the first argument in df.loc[:,:]

In [48]:
#note the .columns attribute returns the column names (an Index of strings)

In [49]:
#.tolist() converts the resulting output of .columns into a regular python list

In [50]:
column_names = df.columns.tolist()

In [51]:
column_names

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [52]:
#SECTION 2 -- grabbing subsets of the data based on conditions 

#the idea is as follows::

#df.column_name followed by some condition
#for ex/

df.Sex == "male"
#so this loops thru the entire Sex column, and returns True if sex == "male", False otherwise 

0       True
1      False
2      False
3      False
4       True
5       True
6       True
7       True
8      False
9      False
10     False
11     False
12      True
13      True
14     False
15     False
16      True
17      True
18     False
19     False
20      True
21      True
22     False
23      True
24     False
25     False
26      True
27      True
28     False
29      True
       ...  
861     True
862    False
863    False
864     True
865    False
866    False
867     True
868     True
869     True
870     True
871    False
872     True
873     True
874    False
875    False
876     True
877     True
878     True
879    False
880    False
881     True
882    False
883     True
884     True
885    False
886     True
887    False
888    False
889     True
890     True
Name: Sex, Length: 891, dtype: bool

In [53]:
df.Age > 30

0      False
1       True
2      False
3       True
4       True
5      False
6       True
7      False
8      False
9      False
10     False
11      True
12     False
13      True
14     False
15      True
16     False
17     False
18      True
19     False
20      True
21      True
22     False
23     False
24     False
25      True
26     False
27     False
28     False
29     False
       ...  
861    False
862     True
863    False
864    False
865     True
866    False
867     True
868    False
869    False
870    False
871     True
872     True
873     True
874    False
875    False
876    False
877    False
878    False
879     True
880    False
881     True
882    False
883    False
884    False
885     True
886    False
887    False
888    False
889    False
890     True
Name: Age, Length: 891, dtype: bool

In [54]:
#this loops thru the Age column and returns True if the age the passenger is greater than 30 

#essentially we are generating a list of booleans which can be used to subset the dataframe
df[df.Age > 30].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S


In [55]:
#just like we can pass column names into the df and return specific columns
#we can also pass these conditions into the df and return only those rows which satisfy the conditions

#we can then use the element-wise operators & ("and") and | ("or") to chain conditions together
#(python's own `and` / `or` keywords do not work on a whole column of booleans)

df[(df.Age > 30) & (df.Sex == "male")].head()

#note that each condition must be placed in () because & and | bind more tightly than comparisons such as > and ==

#so the general form is df[(condition 1) & (condition 2) & ... & (condition n)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
13,14,0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31.275,,S
20,21,0,2,"Fynney, Mr. Joseph J",male,35.0,0,0,239865,26.0,,S
21,22,1,2,"Beesley, Mr. Lawrence",male,34.0,0,0,248698,13.0,D56,S


In [56]:
#observe we are now only returning those rows which correspond to males over the age of 30 

print("There were",len(df[(df.Age > 30) & (df.Sex == "male")]), "passengers who were both male and older than 30")
#also the len() function allows us to count the number of rows (passengers) which satisfy this condition

There were 202 passengers who were both male and older than 30


In [57]:
df[(df.Age > 30) & (df.Sex == "male")]["Survived"].head()
#we can then view specific columns under our subset conditions

4     0
6     0
13    0
20    0
21    1
Name: Survived, dtype: int64

In [58]:
df[(df.Age > 30) & (df.Sex == "male")]["Survived"][:30]
#we can also slice specific columns viewed under our subset conditions

4      0
6      0
13     0
20     0
21     1
30     0
33     0
35     0
54     0
62     0
70     0
74     1
92     0
94     0
96     0
99     0
103    0
104    0
108    0
110    0
116    0
122    0
124    0
129    0
130    0
137    0
148    0
149    0
150    0
152    0
Name: Survived, dtype: int64

In [59]:
df[(df.Age > 30) & (df.Sex == "male")]["Survived"].mean()

0.20297029702970298

In [60]:
#this tells us that 20% of the males over age 30 survived 

In [61]:
df[df.Sex == "male"]["Survived"].mean()
#Survived is coded 0/1, so its mean is the share of survivors: about 19% of males survived

0.18890814558058924

In [62]:
df[df.Sex == "female"]["Survived"].mean()
#this tells us that 74% of females survived

0.7420382165605095

In [63]:
#SECTION 3 -- Creating New Data

In [64]:
#key methods:

#np.where()
#np.select()
#Series.map()  (i.e. df["column_name"].map())
#pd.cut()
#df.apply()

In [65]:
#we create a new column in a way that is analogous to creating a new key,value pair in a dictionary 

#(the reason for this is because a dataframe is basically a dictionary, with the key being the column name
# and the value being the column of data (a list))

In [69]:
df["Age_Squared"] = df["Age"] ** 2
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Squared,Age_Squared
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,484.0,484.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1444.0,1444.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,676.0,676.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1225.0,1225.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1225.0,1225.0


In [None]:
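#observe we have created a new column "Age_Squared" which is our original age column, but squared
#assigning to df["Age_Squared"] adds the column to the original dataframe in place (dataframes are mutable objects)
#(the extra "Age Squared" column in the output above is presumably left over from an earlier run that used that name)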

In [None]:
#now we create a new column called "Family_Size" which is the sum of two preexisting columns, "Parch" and "SibSp"

In [73]:
df["Family_Size"] = df["Parch"] + df["SibSp"]
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Squared,Age_Squared,Family_Size
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,484.0,484.0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1444.0,1444.0,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,676.0,676.0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1225.0,1225.0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1225.0,1225.0,0


In [74]:
#what if we want to create new information based on what already exists in the df?

#key methods for this in numpy are np.select() and np.where()

In [75]:
#np.where() applies in situations involving binary choice <T or F>

#np.where() checks, element by element, whether the condition (or set of conditions) is satisfied
    #returns an array holding one of two customizable labels for each element

#np.where() takes a boolean condition (or chain of conditions connected by & or |) as a first argument
#           the second argument is a label (string/int/float) placed where the condition evaluates to true
#           the third argument is a label (string/int/float) placed where the condition evaluates to false 

#np.where( conditions, indicator for True evaluation, indicator for False evaluation)

In [99]:
#ex/ label every element of a small 2 x 3 array
a = np.array([[1, 2, 3],
              [4, 5, 6]])
b = np.where(np.less(a, 4), "cat", "dog")
b
#elements below 4 (condition True) become "cat"; all the others (condition False) become "dog"

array([['cat', 'cat', 'cat'],
       ['dog', 'dog', 'dog']], dtype='<U3')

In [None]:
#ex using our titanic dataset/

#lets create a new column "Adult_Or_Not" which contains the values "Adult" or 
#"Adolescent", depending on if the age in the row is at least 25

In [97]:
df["Adult_Or_Not"] = np.where(df.Age >= 25, "Adult", "Adolescent")
df[["Age","Adult_Or_Not"]].head(10)

Unnamed: 0,Age,Adult_Or_Not
0,22.0,Adolescent
1,38.0,Adult
2,26.0,Adult
3,35.0,Adult
4,35.0,Adult
5,,Adolescent
6,54.0,Adult
7,2.0,Adolescent
8,27.0,Adult
9,14.0,Adolescent


In [None]:
#note that a missing age (NaN) makes the comparison Age >= 25 evaluate to False, so np.where() labels those rows "Adolescent" (see row 5 above)

In [None]:
#np.select() applies in situations with more complicated conditions 

#the first argument is a list of conditions 

#the second argument is a list of corresponding labels, placed in the resulting array
#    depending on which condition in the list of conditions evaluates to True 
#    (if several conditions are True for a row, the first one in the list wins)

#the third argument is a label which is placed in the resulting array if none of the conditions 
#    are satisfied by that row (e.g. if a value needed by the conditions is missing in that row)
#    the default value is 0 if no third argument is provided 

In [103]:
#ex/ add a "Status" column to the titanic dataset
#    Status combines the Adult/Adolescent label with the passenger's sex

#step 1: one boolean mask per Status value, in the same order as the labels below
conditions = [
    df["Adult_Or_Not"].eq("Adult") & df["Sex"].eq("female"),
    df["Adult_Or_Not"].eq("Adult") & df["Sex"].eq("male"),
    df["Adult_Or_Not"].eq("Adolescent") & df["Sex"].eq("female"),
    df["Adult_Or_Not"].eq("Adolescent") & df["Sex"].eq("male"),
]

#step 2: the label written wherever the mask at the same position is True
results = ["Adult Female", "Adult Male", "Adolescent Female", "Adolescent Male"]

#step 3: np.select picks the matching label for each row; rows matching no mask get "other"
df["Status"] = np.select(condlist=conditions, choicelist=results, default="other")
df[["Sex","Age","Adult_Or_Not","Status"]].head(10)

Unnamed: 0,Sex,Age,Adult_Or_Not,Status
0,male,22.0,Adolescent,Adolescent Male
1,female,38.0,Adult,Adult Female
2,female,26.0,Adult,Adult Female
3,female,35.0,Adult,Adult Female
4,male,35.0,Adult,Adult Male
5,male,,Adolescent,Adolescent Male
6,male,54.0,Adult,Adult Male
7,male,2.0,Adolescent,Adolescent Male
8,female,27.0,Adult,Adult Female
9,female,14.0,Adolescent,Adolescent Female
