In [1]:
import pandas as pd

In [2]:
# Build the example DataFrame from one record per person.
df = pd.DataFrame(
    [
        (28, "Kyle", 75.2, "M"),
        (17, "Erin", 62.7, "F"),
        (9, "Claire", 65.3, "F"),
    ],
    columns=["id", "name", "height (cm)", "gender"],
)
df  # last expression in the cell, so the notebook renders the frame

Unnamed: 0,id,name,height (cm),gender
0,28,Kyle,75.2,M
1,17,Erin,62.7,F
2,9,Claire,65.3,F


In [3]:
df.info() # prints a summary of the frame: index range, per-column non-null counts and dtypes, and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           3 non-null      int64  
 1   name         3 non-null      object 
 2   height (cm)  3 non-null      float64
 3   gender       3 non-null      object 
dtypes: float64(1), int64(1), object(2)
memory usage: 224.0+ bytes


In [4]:
df[['height (cm)']] # selects the height column; passing a list of names returns a one-column DataFrame rather than a Series

Unnamed: 0,height (cm)
0,75.2
1,62.7
2,65.3


In [5]:
df.iloc[:,[2,3,3]] # selects all rows and columns by integer position: position 2 (height) once and position 3 (gender, the 4th column) twice

Unnamed: 0,height (cm),gender,gender.1
0,75.2,M,M
1,62.7,F,F
2,65.3,F,F


In [6]:
df.filter(regex='ei') # keeps only the columns whose name matches the regex 'ei' (here just "height (cm)")

Unnamed: 0,height (cm)
0,75.2
1,62.7
2,65.3


In [7]:
df[df['height (cm)'] > 70] # keeps only the rows where height is greater than 70 (boolean mask)

Unnamed: 0,id,name,height (cm),gender
0,28,Kyle,75.2,M


In [8]:
df.iloc[2] # returns the row at integer position 2 (the 3rd row) as a Series

id                  9
name           Claire
height (cm)      65.3
gender              F
Name: 2, dtype: object

In [9]:
type(df.iloc[2]) # a single row selected with iloc is a pandas Series, not a DataFrame

pandas.core.series.Series

In [10]:
df.iloc[::-1,2:] # rows in reverse order, columns from integer position 2 onward (the last two columns)

Unnamed: 0,height (cm),gender
2,65.3,F
1,62.7,F
0,75.2,M


In [11]:
df.index = [56,23,1] # replaces the default 0..2 row labels with 56, 23, 1 (modifies df in place)

In [12]:
df # displays the DataFrame with the new index

Unnamed: 0,id,name,height (cm),gender
56,28,Kyle,75.2,M
23,17,Erin,62.7,F
1,9,Claire,65.3,F


In [13]:
df.head() # shows the first 5 rows by default; this frame has only 3 rows, so all of them are shown

Unnamed: 0,id,name,height (cm),gender
56,28,Kyle,75.2,M
23,17,Erin,62.7,F
1,9,Claire,65.3,F


In [14]:
df.tail() # shows the last 5 rows by default; this frame has only 3 rows, so all of them are shown

Unnamed: 0,id,name,height (cm),gender
56,28,Kyle,75.2,M
23,17,Erin,62.7,F
1,9,Claire,65.3,F


In [15]:
df.tail(2) # shows the last 2 rows

Unnamed: 0,id,name,height (cm),gender
23,17,Erin,62.7,F
1,9,Claire,65.3,F


In [16]:
df.sample(2) # returns 2 randomly chosen rows; no random_state is set, so the rows and their order can differ on every run

Unnamed: 0,id,name,height (cm),gender
23,17,Erin,62.7,F
56,28,Kyle,75.2,M


In [17]:
df.sample(frac=.70) # returns a random 70% of the rows (2 of the 3 rows here); varies between runs because no random_state is set

Unnamed: 0,id,name,height (cm),gender
56,28,Kyle,75.2,M
1,9,Claire,65.3,F


In [18]:
df.query('name=="Erin"') # keeps the rows where the name column equals "Erin", written as a query string

Unnamed: 0,id,name,height (cm),gender
23,17,Erin,62.7,F


In [19]:
df[df["name"] == "Erin"] # same selection as the query above, written as a boolean mask

Unnamed: 0,id,name,height (cm),gender
23,17,Erin,62.7,F


In [20]:
df.sort_values("name", ascending=False) # returns the rows sorted by name in descending order (they happen to be in that order already)

Unnamed: 0,id,name,height (cm),gender
56,28,Kyle,75.2,M
23,17,Erin,62.7,F
1,9,Claire,65.3,F


In [21]:
# Rename the "height (cm)" column to "height_cm". The result is assigned back
# to df instead of using inplace=True, which keeps the call chainable.
df = df.rename(columns={"height (cm)": "height_cm"})

In [22]:
df # displays the DataFrame with the renamed column

Unnamed: 0,id,name,height_cm,gender
56,28,Kyle,75.2,M
23,17,Erin,62.7,F
1,9,Claire,65.3,F


In [23]:
df.drop(["gender"], axis=1) # returns a copy without the gender column; df itself still has it

Unnamed: 0,id,name,height_cm
56,28,Kyle,75.2
23,17,Erin,62.7
1,9,Claire,65.3


In [24]:
df.drop(columns="gender") # same result as the drop above, using the columns keyword instead of axis=1

Unnamed: 0,id,name,height_cm
56,28,Kyle,75.2
23,17,Erin,62.7
1,9,Claire,65.3


In [25]:
df.iloc[2,2] = None # sets the 3rd row, 3rd column (Claire's height_cm) to a missing value; modifies df in place
df # displays the frame; the missing height shows up as an empty value

Unnamed: 0,id,name,height_cm,gender
56,28,Kyle,75.2,M
23,17,Erin,62.7,F
1,9,Claire,,F


In [26]:
df.dropna() # returns a copy without the rows that contain a missing value (Claire's row here)

Unnamed: 0,id,name,height_cm,gender
56,28,Kyle,75.2,M
23,17,Erin,62.7,F


In [27]:
df.dropna(subset=["height_cm"]) # drops the rows whose height_cm is missing; other columns are not checked

Unnamed: 0,id,name,height_cm,gender
56,28,Kyle,75.2,M
23,17,Erin,62.7,F


In [28]:
df.fillna(65) # returns a copy with missing values replaced by 65 (Claire's height becomes 65.0); df itself is unchanged

Unnamed: 0,id,name,height_cm,gender
56,28,Kyle,75.2,M
23,17,Erin,62.7,F
1,9,Claire,65.0,F


In [29]:
df2 = df.copy() # creates a copy of df with the same data and index

In [30]:
pd.concat([df,df2],axis=1) # places the two frames side by side; both have the same index, so rows line up and every column appears twice

Unnamed: 0,id,name,height_cm,gender,id.1,name.1,height_cm.1,gender.1
56,28,Kyle,75.2,M,28,Kyle,75.2,M
23,17,Erin,62.7,F,17,Erin,62.7,F
1,9,Claire,,F,9,Claire,,F


In [31]:
# Create a second DataFrame and use its id column as the row index.
# set_index is chained onto the constructor instead of using inplace=True,
# so the cell is a single expression.
# NOTE(review): the first id is 29 here but 28 in df — confirm whether the
# mismatch is intentional.
df2 = pd.DataFrame({
    "id":[29,17,9],
    "name":["Kyle","Erin","Claire"],
    "height (cm)": [75.2,62.7,65.3],
    "gender":["M","F","F"]
}).set_index("id")

In [32]:
df2 # displays the second DataFrame, which uses id as its index

Unnamed: 0_level_0,name,height (cm),gender
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
29,Kyle,75.2,M
17,Erin,62.7,F
9,Claire,65.3,F


In [33]:
df # displays the first DataFrame again for comparison (index 56, 23, 1; Claire's height_cm is still missing)

Unnamed: 0,id,name,height_cm,gender
56,28,Kyle,75.2,M
23,17,Erin,62.7,F
1,9,Claire,,F


In [34]:
# NOTE(review): if the intent was to match rows by person, both frames would need the same index (e.g. id) — confirm.
pd.concat([df,df2],axis=1) # axis=1 aligns rows on the index; df (56, 23, 1) and df2 (29, 17, 9) share no labels, so the result has 6 rows padded with missing values (and id is shown as float) instead of 3 matched rows

Unnamed: 0,id,name,height_cm,gender,name.1,height (cm),gender.1
56,28.0,Kyle,75.2,M,,,
23,17.0,Erin,62.7,F,,,
1,9.0,Claire,,F,,,
29,,,,,Kyle,75.2,M
17,,,,,Erin,62.7,F
9,,,,,Claire,65.3,F
