Series can be created from Python dict, arrays, or scalar values (like 1, 2, 3, etc.).
To create a series, we can use the .Series() function.

In [1]:
%pip install pandas
%pip install numpy
%pip install seaborn
%pip install matplotlib



In [5]:
file_path = '/content/sample_data/california_housing_test.csv'
df = pd.read_csv(file_path)


In [6]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


In [2]:
import pandas as pd

# creating series
s1 = pd.Series(["apple", "banana", "lemon", "mango"])

# creating series from a dict
s2 = pd.Series({"A": 1, "B": 2, "C": 3, "D": 4})

# .Series() has index and name parameters that we can customize as needed.
s3 = pd.Series([1, 2, 3, 4, 5], index=["A", "B", "C", "D", "E"], name="simple series")

print(s1)
print(s2)
print(s3)

0     apple
1    banana
2     lemon
3     mango
dtype: object
A    1
B    2
C    3
D    4
dtype: int64
A    1
B    2
C    3
D    4
E    5
Name: simple series, dtype: int64


In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


DataFrame can be created from a Python dictionary, list, 2D array, object series, etc. Create using the .DataFrame() function.

In [8]:
path = '/content/drive/MyDrive/Colab Notebooks/data/large_text_dataset.csv'
df = pd.read_csv(path)

In [9]:
df.head()

Unnamed: 0,text,label
0,The movie was fantastic!,1
1,I did not like the film.,0
2,An amazing performance by the actor.,1
3,The plot was dull and boring.,0
4,It was a wonderful experience.,1


In [11]:
import pandas as pd
pd.__version__

'2.2.2'

In [3]:
import pandas as pd

# creating an empty DataFrame object and then add columns and rows
df1 = pd.DataFrame()
df1["Name"] = ["Takashi", "Jason", "Nakamura", "Olivia"]
df1["Grade"] = [100, 90, 70, 80]

# creating from a dictionary
dictionary = {"Name": ["Takashi", "Jason", "Nakamura", "Olivia"], "Grade": [100, 90, 70, 80]}
df2 = pd.DataFrame(dictionary)

# creating from a list, in which we then specify the column names
a = [90, 100, 95]
b = [81, 89, 85]
c = [75, 70, 77]
df3 = pd.DataFrame([a,b,c], columns=list("ABC"))

# creating from csv
df4 = pd.read_csv("data/prep-data01.csv")
print(df4.describe())
print(df4.describe(include="all"))
print(df4.info())

FileNotFoundError: [Errno 2] No such file or directory: 'data/prep-data01.csv'

# Working with Rows and Columns
We will create a dummy dataset from a random integer number using the NumPy package.

In [None]:
import pandas as pd
import numpy as np

np.random.seed(1)
data = np.random.randint(0, 20, size=(5, 4)) # 5 rows and 4 columns
df = pd.DataFrame(data, columns=list("ABCD"))
# print(df)

# get brief information
# print(df.index)
# print(df.columns)

# copy df and drop columns C and D
dfDrop = df.copy()
dfDrop.drop(columns=["C", "D"], inplace=True)
print(dfDrop)

# copy df and add new column E with numpy random
dfAdd = df.copy()
dfAdd["E"] = np.random.randint(0, 20, size=(5, 1))
print(dfAdd)


In [None]:
import pandas as pd
import numpy as np
np.random.seed(1)
data = np.random.randint(0, 20, size=(5, 4)) # 5 rows and 4 columns
df = pd.DataFrame(data, columns=list("ABCD"))
print(df)

print(df['A'])
print(df.loc[0]) # label = 0 (label can be string)
print(df.iloc[0])
print(df[0:2])
print(df['A'] > 5)

# Splitting Data

In [None]:
import pandas as pd
import numpy as np
np.random.seed(1)
data = np.random.randint(0, 20, size=(5, 4)) # 5 rows and 4 columns
df = pd.DataFrame(data, columns=list("ABCD"))
print(df, '\n')

# .iloc[row index, column index], : means all
df1 = df.iloc[:2 , :] # :2 means splitting rows from index 0 to 2
print(df1, '\n')

df2 = df.iloc[2: , :] # 2: means splitting rows from index 3 to n
print(df2, '\n')

# .iloc[row index, column index], : means all
df3 = df.iloc[: , :2] # :2 means splitting columns from index 0 to 2
print(df3, '\n')

df4 = df.iloc[: , 2:] # 2: means splitting columns from index 3 to n
print(df4, '\n')

# .groupby() function is used to split the DataFrame based on some values
grouped = df.groupby(df.A)
# we can select specified groups using the get_group() function
df5 = grouped.get_group(5)
print(df5, '\n')

# n: int value, Number of random rows to generate.
df6 = df.sample(n = 2)
print(df6, '\n')

# frac: Float value, Returns (float value * length of data frame values ). frac cannot be used with n.
df7 = df.sample(frac = 0.5)
print(df7, '\n')


# Merge Data
1. Concatenating Data

In [None]:
import pandas as pd

left = pd.DataFrame({"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=["K0", "K1", "K2"])
right = pd.DataFrame({"C": ["C0", "C2", "C3"], "D": ["D0", "D2", "D3"]}, index=["K0", "K2", "K3"])
df1 = pd.concat([left, right])

# axis=0 to concat along rows, axis=1 to concat along columns.
df2 = pd.concat([left, right], axis=0)
df3 = pd.concat([left, right], axis=1)

print(df1, '\n')
print(df2, '\n')
print(df3, '\n')

2. Merging Data
2.1 merge using index

In [None]:
import pandas as pd

df1 = pd.DataFrame({'Courses': ["Spark", "PySpark", "Python", "pandas"],
                    'Fee': [20000, 25000, 22000, 24000]},
                    index=['r1', 'r2', 'r3', 'r4'])

df2 = pd.DataFrame({'Duration': ['30day', '40days', '35days', '60days', '55days'],
                    'Discount': [1000, 2300, 2500, 2000, 3000]},
                    index=['r1', 'r2', 'r3', 'r5', 'r6'])

# Merge two DataFrames by index using pandas.merge()
df3 = pd.merge(df1, df2, left_index=True, right_index=True)

print(df1, '\n')
print(df2, '\n')
print(df3, '\n')

2.2 merge using column name

In [None]:
import pandas as pd

df1 = pd.DataFrame({'Courses': ["Spark", "PySpark", "Python", "pandas"],
                    'Fee': [20000, 25000, 22000, 24000],
                    'Name': ["John", "James", "Peter", "David"]},
                    index=['r1', 'r2', 'r3', 'r4'])

df2 = pd.DataFrame({'Duration': ['30day', '40days', '35days', '60days', '55days'],
                    'Discount': [1000, 2300, 2500, 2000, 3000],
                    'Name': ["John", "Billy", "Marcus", "James", "David"]},
                    index=['r1', 'r2', 'r3', 'r5', 'r6'])

# Merge two DataFrames by column using pandas.merge()
df3 = pd.merge(df1, df2, on="Name")

print(df1, '\n')
print(df2, '\n')
print(df3, '\n')

3. Joining Data

In [None]:
import pandas as pd

df1 = pd.DataFrame({"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]},
                    index=["K0", "K1", "K2"])

df2 = pd.DataFrame({"C": ["C0", "C2", "C3"], "D": ["D0", "D2", "D3"]},
                     index=["K0", "K2", "K3"])

df3 = df1.join(df2)
df4 = df2.join(df1)

resultInner = df1.join(df2, how="inner")
resultOuter = df1.join(df2, how="outer")

print(df1, '\n')
print(df2, '\n')
print(df3, '\n')
print(df4, '\n')

print(resultInner, '\n')
print(resultOuter, '\n')



# Missing Values Handling

# Outliers Detection

In [None]:
import pandas as pd

# 1. Load the dataset
df = pd.read_csv("data/prep-data02.csv")
print(df.head(), '\n')

# # 2. Check the datatype for each column
print(df.dtypes, '\n')

# 3. Data Summary
print(df.describe(include='all').to_string(), '\n')

# 4. Finding the missing values
# # 4.1 Check for missing data in any of the variables.
print(df.info(), '\n')

# # 4.2 Finding the missing values in each column
print(df.isnull().sum(), '\n')

# 5. Handling Missing Values
# # 5.1 Drop missing values
datadrop = df.dropna()
print(datadrop.describe(include='all').to_string(), '\n')

# # 5.2 Fill missing values with Mean/Meidan/Mode
# for numerical data (mean/median)
df1 = df.copy()
df1.fillna({'Temperature': df.Temperature.mean()}, inplace=True)
df1.fillna({'Humidity': df.Humidity.mean()}, inplace=True)
df1.fillna({'Airflow': df.Airflow.median()}, inplace=True)

# for categorical data (mode)
df1.fillna({'Weather Condition': df['Weather Condition'].mode()[0]}, inplace=True)

print(df1.describe(include='all').to_string(), '\n')

# # 5.3 Fill missing values with Forward Fill/Backward Fill
df2 = df.copy()
df2.ffill(inplace=True)

df3 = df.copy()
df3.bfill(inplace=True)

print(df2.head(), '\n')
print(df3.head(), '\n')

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# 1. Load the dataset
df = pd.read_csv("data/prep-data03.csv")

# 2. Show outlier of each column in df using boxplot
plt.figure(figsize=(10, 5))
sns.boxplot(data=df)
plt.title("Boxplot of df before eliminating outliers")
plt.show()

# 3. Eliminate outliers of each column in df using IQR
Q1 = df['Temperature'].quantile(0.25)
Q3 = df['Temperature'].quantile(0.75)
IQR = Q3 - Q1
df = df[(df['Temperature'] > (Q1 - 1.5 * IQR)) & (df['Temperature'] < (Q3 + 1.5 * IQR))]

plt.figure(figsize=(10, 5))
sns.boxplot(data=df)
plt.title("Boxplot of df after eliminating outliers")
plt.show()

