###### DataFrame created from data about passengers on the Titanic

In [0]:
# Outside a Databricks notebook, install pandas first:
#   pip install pandas

# Import pandas under its conventional alias
import pandas as pd

# Location of the raw Titanic passenger CSV
url = 'https://raw.githubusercontent.com/chrisalbon/sim_data/master/titanic.csv'

# Download the CSV from the URL and parse it into a DataFrame
dataframe = pd.read_csv(filepath_or_buffer=url)

In [0]:
# View the first five rows. Leaving the expression as the last line of the
# cell lets the notebook render the DataFrame as a rich table instead of the
# plain-text output that print() produces.
dataframe.head()

In [0]:
# Show dimensions as a (number of rows, number of columns) tuple
dataframe.shape

In [0]:
# Print a concise summary of the DataFrame: columns, non-null counts, and dtypes
dataframe.info()

In [0]:
# Show descriptive statistics (count, mean, std, min, quartiles, max)
# for the numeric columns
dataframe.describe()

###### Slicing DataFrames

In [0]:
# Select the first row by integer position (position 0)
dataframe.iloc[0]

In [0]:
# Select three rows by integer position: the second, third, and fourth
# (the slice end, 4, is exclusive)
dataframe.iloc[1:4]

###### Selecting Rows Based on Conditionals


In [0]:
# Keep only the rows whose 'Sex' column equals 'male', then show the first five
dataframe.loc[dataframe['Sex'].eq('male')].head()

###### Sorting Values


In [0]:
# Order the rows by ascending 'Age' and show the first five of the result
dataframe.sort_values("Age").head(5)

In [0]:
# Combine two conditions with &: 'Sex' is 'male' AND 'Age' is at least 70
dataframe.loc[dataframe['Sex'].eq('male') & dataframe['Age'].ge(70)]

###### Replacing Values

In [0]:
# Map "female" -> "Woman" and "male" -> "Man" in the 'Sex' column,
# then show the first five values
dataframe['Sex'].replace({"female": "Woman", "male": "Man"}).head()

In [0]:
# Replace every value equal to 1, in any column, with the string "One";
# show the first five rows
dataframe.replace(to_replace=1, value="One").head()

In [0]:
# Treat "1st" as a regular expression and substitute "First" wherever it
# matches; show the first five rows
dataframe.replace(to_replace=r"1st", value="First", regex=True).head()

###### Renaming Columns

In [0]:
# Relabel the 'PClass' column as 'Passenger Class'; show the first five rows
dataframe.rename({'PClass': 'Passenger Class'}, axis='columns').head()

In [0]:
# Relabel several columns in one call by passing an old-name -> new-name
# mapping; show the first five rows
dataframe.rename(
    {
        'PClass': 'Passenger Class',
        'Sex': 'Gender',
    },
    axis='columns',
).head()

###### Finding the Minimum, Maximum, Sum, Average, and Count

In [0]:
# Calculate summary statistics of the 'Age' column and print each one
# with a label
print('Maximum:', dataframe['Age'].max())
print('Minimum:', dataframe['Age'].min())
print('Mean:', dataframe['Age'].mean())
print('Sum:', dataframe['Age'].sum())
# count() reports the number of non-missing values
print('Count:', dataframe['Age'].count())

###### Finding Unique Values

In [0]:
# Select the distinct values that appear in the 'Sex' column
dataframe['Sex'].unique()

In [0]:
# Show how many times each distinct value appears in the 'Sex' column
dataframe['Sex'].value_counts()

###### Handling Missing Values

In [0]:
# Select the rows whose 'Age' is missing; show the first five of them
dataframe.loc[dataframe['Age'].isna()].head()

###### Deleting a Column

In [0]:
# Return a copy without the 'Age' column (the original DataFrame is not
# modified); show the first five rows
dataframe.drop(columns='Age').head()

In [0]:
# Drop multiple columns at once by passing a list of column names;
# show the first five rows
dataframe.drop(columns=['Age', 'Sex']).head()

In [0]:
# Drop a column by position: look up the label at position 1 (the second
# column) and drop that label; show the first five rows
dataframe.drop(columns=dataframe.columns[1]).head()

###### Deleting a Row

In [0]:
# "Delete" rows by keeping only those whose 'Sex' is not 'male'
# (the original DataFrame is not modified); show the first five rows
dataframe.loc[dataframe['Sex'].ne('male')].head()

###### Dropping Duplicate Rows


In [0]:
# Drop rows that are exact duplicates of an earlier row across all columns;
# show the first five rows of the result
dataframe.drop_duplicates().head()

###### Grouping Rows by Values

In [0]:
# Group rows by the values of the column 'Sex' and calculate the mean of
# each numeric column within each group
dataframe.groupby('Sex').mean(numeric_only=True)

In [0]:
# Group rows by 'Survived' and count the non-missing 'Name' values
# in each group
dataframe.groupby('Survived')['Name'].count()

In [0]:
# Group rows by each ('Sex', 'Survived') combination and calculate the
# mean 'Age' of each group
dataframe.groupby(['Sex','Survived'])['Age'].mean()

###### Aggregating Operations and Statistics


In [0]:
# Apply the "min" aggregation to every column of the DataFrame
dataframe.agg("min")

In [0]:
# Apply different aggregations per column: the mean of 'Age', and the
# minimum and maximum of 'SexCode'
dataframe.agg({
    "Age": ["mean"],
    "SexCode": ["min", "max"],
})

In [0]:
# Count the passengers who did and did not survive in each class:
# group by class and survival flag, count the 'Survived' entries per group,
# then turn the group keys back into ordinary columns
(
    dataframe
    .groupby(["PClass", "Survived"])
    .agg({"Survived": ["count"]})
    .reset_index()
)

###### Looping Over a Column


In [0]:
# Iterate over the first five entries of the 'Name' column and print each
# one in upper case
for name in dataframe['Name'].head(5):
    print(name.upper())