In [1]:
import os

import numpy as np
import pandas as pd

## Column selection

In [2]:
# Column-oriented sample data: each keyword becomes a column, each list its values.
data = dict(
    Name=["Jai", "Princi", "Gaurav", "Anuj"],
    Age=[27, 24, 22, 32],
    Address=["Delhi", "Kanpur", "Allahabad", "Kannauj"],
    Qualification=["Msc", "MA", "MCA", "Phd"],
)

# No index is supplied, so the rows get the default labels 0..3.
df = pd.DataFrame(data)

df

Unnamed: 0,Name,Age,Address,Qualification
0,Jai,27,Delhi,Msc
1,Princi,24,Kanpur,MA
2,Gaurav,22,Allahabad,MCA
3,Anuj,32,Kannauj,Phd


In [3]:
# A list of labels inside the brackets returns a DataFrame with just those columns.
df[["Name", "Address"]]  # for selecting two columns

Unnamed: 0,Name,Address
0,Jai,Delhi
1,Princi,Kanpur
2,Gaurav,Allahabad
3,Anuj,Kannauj


## Row selection

In [4]:
# .loc looks rows up by index label; a single label returns that row as a Series.
df.loc[2]

Name                Gaurav
Age                     22
Address          Allahabad
Qualification          MCA
Name: 2, dtype: object

In [5]:
# Label-based slice: both end labels are inclusive; labels 4 and 5 do not exist
# in this frame, so only rows 2 and 3 come back.
df.loc[2:5]

Unnamed: 0,Name,Age,Address,Qualification
2,Gaurav,22,Allahabad,MCA
3,Anuj,32,Kannauj,Phd


In [6]:
# .iloc looks rows up by 0-based integer position, regardless of the index labels.
df.iloc[2]

Name                Gaurav
Age                     22
Address          Allahabad
Qualification          MCA
Name: 2, dtype: object

In [7]:
# Position-based slice: the stop is exclusive and is clipped to the number of rows.
df.iloc[2:5]

Unnamed: 0,Name,Age,Address,Qualification
2,Gaurav,22,Allahabad,MCA
3,Anuj,32,Kannauj,Phd


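__Question:__ What is the difference between `df.loc` and `df.iloc`? (`df.loc` selects by index label, `df.iloc` selects by integer position; they only coincide while the index is the default 0..n-1.)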

In [10]:
# Sanity check: list the contents of the datasets folder before loading from it.
# Forward slashes keep the relative path usable on Windows, macOS and Linux.
# NOTE(review): this walks three levels up, while the read_csv cells in this
# notebook walk two levels up — confirm which depth is correct.
os.listdir("../../../datasets")

FileNotFoundError: [WinError 3] The system cannot find the path specified: '..\\..\\..\\datasets'

In [11]:
# Load the Titanic data; with no index_col the rows get the default 0..n-1 labels.
# Forward slashes replace the backslash literal, whose "\.", "\d" and "\T" were
# invalid escape sequences, and keep the path usable on every OS.
df = pd.read_csv("../../datasets/Titanic_Disaster.csv")
df

FileNotFoundError: [Errno 2] No such file or directory: '..\\..\\datasets\\Titanic_Disaster.csv'

In [12]:
# Any column can be used as the index, but "Survived" is NOT unique, so its
# labels cannot identify a single row — not recommended.
# Forward slashes avoid the invalid backslash escapes and work on every OS.
df = pd.read_csv("../../datasets/Titanic_Disaster.csv", index_col="Survived")
df

FileNotFoundError: [Errno 2] No such file or directory: '..\\..\\datasets\\Titanic_Disaster.csv'

In [None]:
# Use the "Ticket" column as the row index so rows can be looked up by ticket label.
# Forward slashes avoid the invalid backslash escapes and work on every OS.
df = pd.read_csv("../../datasets/Titanic_Disaster.csv", index_col="Ticket")
df

In [None]:
# When index_col is given in the df definition, .loc takes the index labels
# while .iloc still takes positions, so df.loc and df.iloc will differ.
ticket_label = "STON/O2. 3101282"
df.loc[ticket_label]

In [None]:
# Position 2 is the third row, whatever the index labels are.
df.iloc[2]

In [None]:
# A single label in brackets returns that column as a Series.
df["Name"]

In [None]:
# A list of labels returns a DataFrame holding those columns.
df[["Name", "Sex"]]

Working with missing data

In [None]:
# Three score columns, each containing one missing entry (np.nan).
# The mapping is called `scores` rather than `dict` so the built-in `dict`
# type is not shadowed for every cell that runs afterwards.
scores = {
    "First Score": [100, 90, np.nan, 95],
    "Second Score": [30, 45, 56, np.nan],
    "Third Score": [np.nan, 40, 80, 98],
}

df_dict = pd.DataFrame(scores)
df_dict

In [None]:
# Boolean mask with the same shape as df_dict: True wherever a value is missing.
df_dict.isnull()

Filling missing values using fillna(), replace() and interpolate()

In [None]:
# Returns a new frame in which every missing value is replaced by 0.
df_dict.fillna(0)

In [None]:
# The fillna result above was never assigned back, so df_dict still has its NaNs.
df_dict  # original df is not changed

In [None]:
# Returns a new frame keeping only the rows that contain no missing value.
df_dict.dropna()

In [None]:
# The dropna result above was never assigned back, so df_dict is untouched.
df_dict  # original df is not changed

Iterating over the dataframe

In [None]:
# Looping over a DataFrame walks its column labels.
for column_label in df_dict.columns:
    print(column_label)

In [None]:
# items() yields one (column label, column Series) tuple per column.
for label_and_column in df_dict.items():
    print(label_and_column, "\n")

In [None]:
# Unpack each pair: the second element is the complete column as a Series.
for column_label, column_values in df_dict.items():
    print(column_label, "===>", column_values, "\n")

To iterate over each row

In [None]:
# iterrows() yields one (index label, row Series) pair per row.
for row_label, row_values in df_dict.iterrows():
    print(row_label, "===>", row_values, "\n")

In [None]:
# TODO - iterate rows

Pandas Series

In [None]:
# Plain one-dimensional NumPy array, used as the input for the first Series.
values = [1, 2, 3, 4, 45]
data = np.array(values)
data

In [None]:
# Wrap the array in a Series; with no index given, the labels default to 0..n-1.
p_ser = pd.Series(data)
p_ser

In [None]:
# A Series can hold mixed Python values (int, float, bool) taken from one list.
my_list = [12, 23.231, True, False]
p_ser = pd.Series(data=my_list)

p_ser

In [None]:
# With the default 0..n-1 labels, key 3 returns the fourth element.
p_ser[3]

In [None]:
# Integer slice: stop-exclusive and clipped to the Series length, so a
# four-element Series returns only the elements at positions 2 and 3.
p_ser[2:5]

In [None]:
# Series of strings; no index is supplied, so the labels default to 0..4.
vowels = ["a", "e", "i", "o", "u"]
ser = pd.Series(vowels)
ser

In [None]:
# Same values, but with explicit integer labels instead of the default 0..4.
custom_labels = [13, 14, 15, 16, 17]
ser = pd.Series(["a", "e", "i", "o", "u"], index=custom_labels)
ser

In [None]:
# Reload the Titanic data with the default index and take the "Name" column as a Series.
# Forward slashes replace the backslash literal, whose "\.", "\d" and "\T" were
# invalid escape sequences, and keep the path usable on every OS.
df = pd.read_csv("../../datasets/Titanic_Disaster.csv")
ser = pd.Series(df["Name"])
ser

In [None]:
# head() with no argument returns the first 5 entries.
ser.head()

In [None]:
# head(n) returns the first n entries — here 9.
ser.head(9)

In [None]:
# Label lookup: the frame was read without index_col, so label 3 is the fourth entry.
ser.loc[3]

In [None]:
# Boolean Series: True where an entry is missing, False elsewhere.
ser.isnull()

In [None]:
# Building from a mapping: the keys become the index labels, the values the data.
data = pd.Series({"a": 5, "b": 2, "c": 3, "d": 7})
data

In [None]:
# Asymmetric lengths: six values but only five index labels.
# pandas rejects the mismatch with a ValueError, so the error is caught and
# printed instead of letting the cell abort a Restart-and-Run-All.
data1 = None  # stays None because the constructor below is expected to fail
try:
    data1 = pd.Series([1, 6, 4, 9, 5, 6], index=["a", "b", "d", "e", "f"])
except ValueError as exc:
    print(f"ValueError: {exc}")
data1

In [None]:
# Deliberately messy sample records: "." stands in for missing entries and some
# scores are strings with a comma as thousands separator.
raw_data = dict(
    first_name=["Jason", "Molly", "Tina", "Jake", "Amy"],
    last_name=["Miller", "Jacobson", ".", "Milner", "Cooze"],
    age=[42, 52, 36, 24, 73],
    preTestScore=[4, 24, 31, ".", "."],
    postTestScore=["25,000", "94,000", 57, 62, 70],
)

In [None]:
# Build the frame with the columns in the same order as the dictionary keys.
column_order = list(raw_data)
df = pd.DataFrame(raw_data, columns=column_order)

In [None]:
# Display the frame built from raw_data.
df

In [None]:
# List the attribute and method names available on the DataFrame (long output).
dir(df)

### Save dataframe as csv in the working directory


In [None]:
# Write the frame to a CSV in the current working directory. By default to_csv
# also writes the header line and the row index (as a first, unnamed column).
df.to_csv("pandas_created_file.csv")

### Load a csv

In [None]:
# Read the file back with default settings: the first line supplies the column names.
df = pd.read_csv("pandas_created_file.csv")
df

### Load a csv with no headers

In [None]:
# header=None tells pandas the file has no header line, so columns are numbered
# 0, 1, 2, ... and the first line is read as data. This file was written with a
# header line, which therefore shows up as the first data row here.
df = pd.read_csv("pandas_created_file.csv", header=None)
df

### Load a csv while specifying column names


In [None]:
# Replace the column names stored in the file with custom ones.
# The file starts with the header line written by to_csv, so header=0 tells
# pandas to discard that line in favour of `names`; without it the old header
# would be read as the first data row.
df = pd.read_csv(
    "pandas_created_file.csv",
    header=0,
    names=[
        "UID",
        "First Name",
        "Last Name",
        "Age",
        "Pre-Test Score",
        "Post-Test Score",
    ],
)
df

### Load a csv with setting the index column to UID


In [None]:
# Rename the columns and use the first one ("UID", the index saved by to_csv)
# as the row index.
# header=0 discards the header line already in the file; without it that line
# would be read as a data row and end up in the index.
df = pd.read_csv(
    "pandas_created_file.csv",
    header=0,
    index_col="UID",
    names=[
        "UID",
        "First Name",
        "Last Name",
        "Age",
        "Pre-Test Score",
        "Post-Test Score",
    ],
)
df

In [None]:
# "UID" is the index of this frame, so .loc[2] fetches the row whose UID label is 2.
df.loc[2]

### Load a csv while setting the index columns to First Name and Last Name


In [None]:
# Rename the columns and build a two-level row index from first and last name.
# header=0 discards the header line already in the file; without it that line
# would be read as a data row and become an index entry.
df = pd.read_csv(
    "pandas_created_file.csv",
    header=0,
    index_col=["First Name", "Last Name"],
    names=[
        "UID",
        "First Name",
        "Last Name",
        "Age",
        "Pre-Test Score",
        "Post-Test Score",
    ],
)
df

### Load a csv while specifying “.” as missing values


In [None]:
# Treat a lone "." as a missing value while parsing, then show the null mask.
missing_markers = ["."]
df = pd.read_csv("pandas_created_file.csv", na_values=missing_markers)
df.isnull()

### Load a csv while interpreting “,” in strings around numbers as thousands separators


In [None]:
# thousands="," makes the parser treat commas inside numeric fields as digit
# separators, so a value stored as "25,000" is read as the number 25000.
df = pd.read_csv("pandas_created_file.csv", thousands=",")
df