# Import the pandas and NumPy libraries with their conventional aliases

In [3]:
import pandas as pd
import numpy as np

# Different ways of creating a Pandas DataFrame

## 1. Create DataFrame from dictionary of lists

In [4]:
# Column-oriented input: every key becomes a column label and every list
# supplies that column's values, one entry per row.
dictionary_of_lists_1 = {
    'Name': ['Emma', 'Oliver', 'Harry', 'Sophia'],
    'Age': [29, 25, 33, 24],
    'Department': ['HR', 'Finance', 'Marketing', 'IT'],
}
# No index is passed, so the rows get the default integer labels 0..3.
df1 = pd.DataFrame(data=dictionary_of_lists_1)
print("\nBasic DataFrame with only column names\n")
print(df1, type(df1), sep="\n")


# Same kind of input, but this time the rows are given explicit labels.
dictionary_of_lists_2 = {'apples': [3, 2, 0, 1], 'oranges': [0, 3, 7, 2]}
row_labels = ['June', 'Robert', 'Lily', 'David']
df2 = pd.DataFrame(data=dictionary_of_lists_2, index=row_labels)
print("\nBasic DataFrame with index names specified\n")
print(df2)



Basic DataFrame with only column names

     Name  Age Department
0    Emma   29         HR
1  Oliver   25    Finance
2   Harry   33  Marketing
3  Sophia   24         IT
<class 'pandas.core.frame.DataFrame'>

Basic DataFrame with index names specified

        apples  oranges
June         3        0
Robert       2        3
Lily         0        7
David        1        2



## 2. Create DataFrame from simple 2D NumPy array

In [5]:
# Draw from a seeded generator so that Restart & Run All reproduces the same
# array (and therefore the same DataFrames) on every run.
SEED = 42
rng = np.random.default_rng(SEED)

# 4 rows x 3 columns of integers in [0, 10); the upper bound is exclusive.
simple2Darray = rng.integers(10, size=(4, 3))
print("\n2D Numpy array\n")
print(simple2Darray)

# Without explicit labels, pandas numbers both the rows and the columns from 0.
print("\nBasic DataFrame\n")
df3 = pd.DataFrame(simple2Darray)
print(df3)

# `columns` supplies one label per array column (three here).
print("\nDataFrame with column names specified\n")
df4 = pd.DataFrame(simple2Darray, columns=['firstcol', 'secondcol', 'thirdcol'])
print(df4)



2D Numpy array

[[5 1 8]
 [2 2 8]
 [9 3 9]
 [2 4 4]]

Basic DataFrame

   0  1  2
0  5  1  8
1  2  2  8
2  9  3  9
3  2  4  4

DataFrame with column names specified

   firstcol  secondcol  thirdcol
0         5          1         8
1         2          2         8
2         9          3         9
3         2          4         4


## 3. Create DataFrame from list of dictionaries

In [6]:
# Row-oriented input: each dictionary is one record and its keys name the columns.
list_of_dictionaries = [
    {'Name': 'Emma', 'Age': 29, 'Department': 'HR'},
    {'Name': 'Oliver', 'Age': 25, 'Department': 'Finance'},
    {'Name': 'Harry', 'Age': 33, 'Department': 'Marketing'},
    {'Name': 'Sophia', 'Age': 24, 'Department': 'IT'},
]
# One row label per record, in the same order as the records above.
employee_labels = ['Employee1', 'Employee2', 'Employee3', 'Employee4']
df5 = pd.DataFrame(data=list_of_dictionaries, index=employee_labels)
print(df5)

             Name  Age Department
Employee1    Emma   29         HR
Employee2  Oliver   25    Finance
Employee3   Harry   33  Marketing
Employee4  Sophia   24         IT


## 4. Create DataFrame by reading a CSV file

In [7]:
# Load the customer data into a DataFrame. The path is relative, so
# customers.csv must be reachable from the notebook's working directory.
customers = pd.read_csv("customers.csv")

# Working with a DataFrame

## 1. Viewing the first and last rows of a large DataFrame

In [8]:
# head() returns the first 5 rows by default; n is spelled out here for clarity.
# As the cell's last expression, the result is rendered as the cell output.
print("\nShowing the first 5 rows\n")
customers.head(n=5)



Showing the first 5 rows



Unnamed: 0,CustomerID,Gender,Age,Income,Score
0,1,Male,19.0,15,39
1,2,Male,21.0,15,81
2,3,Female,20.0,16,6
3,4,Female,,16,77
4,5,Female,31.0,17,40


In [9]:
# tail() mirrors head(): it returns the last 5 rows by default.
print("\nShowing the last 5 rows\n")
customers.tail(n=5)


Showing the last 5 rows



Unnamed: 0,CustomerID,Gender,Age,Income,Score
195,196,Female,35.0,120,79
196,197,Female,45.0,126,28
197,198,Male,32.0,126,74
198,199,Male,32.0,137,18
199,200,Male,30.0,137,83


In [10]:
# Passing n selects how many rows to take from the start of the DataFrame.
print("\nShowing a specified number of rows from the start\n")
customers.head(n=15)


Showing a specified number of rows from the start



Unnamed: 0,CustomerID,Gender,Age,Income,Score
0,1,Male,19.0,15,39
1,2,Male,21.0,15,81
2,3,Female,20.0,16,6
3,4,Female,,16,77
4,5,Female,31.0,17,40
5,6,Female,22.0,17,76
6,7,Female,35.0,18,6
7,8,Female,23.0,18,94
8,9,,64.0,19,3
9,10,Female,30.0,19,72


## 2. Getting a general overview of the dataset

In [11]:
# Size of the DataFrame as a (number of rows, number of columns) tuple.
customers.shape

(200, 5)

In [12]:
# Column labels of the DataFrame, returned as a pandas Index.
customers.columns

Index(['CustomerID', 'Gender', 'Age', 'Income', 'Score'], dtype='object')

In [13]:
# Row labels of the DataFrame; read_csv was given no index column, so this is
# the default integer range.
customers.index

RangeIndex(start=0, stop=200, step=1)

In [14]:
# Prints a per-column summary (non-null count and dtype) plus memory usage;
# a quick way to spot columns with missing values.
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CustomerID  200 non-null    int64  
 1   Gender      198 non-null    object 
 2   Age         197 non-null    float64
 3   Income      200 non-null    int64  
 4   Score       200 non-null    int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 7.9+ KB


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the non-numeric Gender column is left out by default.
customers.describe()

Unnamed: 0,CustomerID,Age,Income,Score
count,200.0,197.0,200.0,200.0
mean,100.5,38.918782,60.56,50.2
std,57.879185,14.020157,26.264721,25.823522
min,1.0,18.0,15.0,1.0
25%,50.75,29.0,41.5,34.75
50%,100.5,36.0,61.5,50.0
75%,150.25,49.0,78.0,73.0
max,200.0,70.0,137.0,99.0


## 3. Accessing rows and columns

In [16]:
# Small labelled DataFrame used by the column/row access examples that follow.
list_of_dictionaries = [
    {'Name': 'Emma', 'Age': 29, 'Department': 'HR'},
    {'Name': 'Oliver', 'Age': 25, 'Department': 'Finance'},
    {'Name': 'Harry', 'Age': 33, 'Department': 'Marketing'},
    {'Name': 'Sophia', 'Age': 24, 'Department': 'IT'},
]
df = pd.DataFrame(
    data=list_of_dictionaries,
    index=['Employee1', 'Employee2', 'Employee3', 'Employee4'],
)
print(df)

             Name  Age Department
Employee1    Emma   29         HR
Employee2  Oliver   25    Finance
Employee3   Harry   33  Marketing
Employee4  Sophia   24         IT


In [17]:
# Selecting with a single column label yields a Series, not a DataFrame.
name_col = df['Name']
print(name_col, type(name_col), sep="\n")


Employee1      Emma
Employee2    Oliver
Employee3     Harry
Employee4    Sophia
Name: Name, dtype: object
<class 'pandas.core.series.Series'>


In [18]:
# Selecting with a *list* of column labels keeps the result a DataFrame.
wanted_columns = ['Name', 'Department']
multiple_cols = df[wanted_columns]
print(multiple_cols, type(multiple_cols), sep="\n")

             Name Department
Employee1    Emma         HR
Employee2  Oliver    Finance
Employee3   Harry  Marketing
Employee4  Sophia         IT
<class 'pandas.core.frame.DataFrame'>


In [19]:
# .loc looks a row up by its label; a single row comes back as a Series whose
# index is the DataFrame's column labels.
single_row = df.loc["Employee2"]
for shown in (single_row, type(single_row)):
    print(shown)

Name           Oliver
Age                25
Department    Finance
Name: Employee2, dtype: object
<class 'pandas.core.series.Series'>


In [20]:
# .iloc looks a row up by its 0-based position, so 3 is the fourth row.
single_row = df.iloc[3]
for shown in (single_row, type(single_row)):
    print(shown)

Name          Sophia
Age               24
Department        IT
Name: Employee4, dtype: object
<class 'pandas.core.series.Series'>


## 4. Slicing rows and columns

In [21]:
# Rebuild the small labelled DataFrame so the slicing examples below are
# self-contained.
list_of_dictionaries = [
    {'Name': 'Emma', 'Age': 29, 'Department': 'HR'},
    {'Name': 'Oliver', 'Age': 25, 'Department': 'Finance'},
    {'Name': 'Harry', 'Age': 33, 'Department': 'Marketing'},
    {'Name': 'Sophia', 'Age': 24, 'Department': 'IT'},
]
employee_labels = ['Employee1', 'Employee2', 'Employee3', 'Employee4']
df = pd.DataFrame(list_of_dictionaries, index=employee_labels)
print(df)

             Name  Age Department
Employee1    Emma   29         HR
Employee2  Oliver   25    Finance
Employee3   Harry   33  Marketing
Employee4  Sophia   24         IT


In [22]:
# Slice rows by label with the explicit .loc indexer rather than bare df[...],
# so it is unambiguous that rows (not columns) are being selected by label.
# Unlike positional slices, a label slice includes BOTH endpoints, so
# Employee3 is part of the result.
multiple_rows = df.loc['Employee1':'Employee3']
print(multiple_rows)

             Name  Age Department
Employee1    Emma   29         HR
Employee2  Oliver   25    Finance
Employee3   Harry   33  Marketing


In [23]:
# Positional slice: start is inclusive and stop is exclusive, so 1:3 selects
# the rows at positions 1 and 2.
multiple_rows = df.iloc[1:3]
print(multiple_rows)

             Name  Age Department
Employee2  Oliver   25    Finance
Employee3   Harry   33  Marketing


In [24]:
# An omitted start means "from the first row"; the stop position 8 is
# exclusive, so positions 0 through 7 are returned.
print("\nFrom the starting row until one row before row #8\n")
customers.iloc[:8]


From the starting row until one row before row #8



Unnamed: 0,CustomerID,Gender,Age,Income,Score
0,1,Male,19.0,15,39
1,2,Male,21.0,15,81
2,3,Female,20.0,16,6
3,4,Female,,16,77
4,5,Female,31.0,17,40
5,6,Female,22.0,17,76
6,7,Female,35.0,18,6
7,8,Female,23.0,18,94


In [25]:
# First slot selects rows (":" keeps them all); second slot selects columns by
# position, here positions 1, 2 and 3 because the stop of 4 is exclusive.
print("\nAll the rows from column 1 to 3\n")
customers.iloc[:, 1:4]


All the rows from column 1 to 3



Unnamed: 0,Gender,Age,Income
0,Male,19.0,15
1,Male,21.0,15
2,Female,20.0,16
3,Female,,16
4,Female,31.0,17
...,...,...,...
195,Female,35.0,120
196,Female,45.0,126
197,Male,32.0,126
198,Male,32.0,137


In [26]:
# Slice both axes at once: row positions 2-4 and column positions 0-2
# (both stop values are exclusive).
print("\nFrom row 2 to 4 and from column 0 to 2\n")
customers.iloc[2:5, 0:3]


From row 2 to 4 and from column 0 to 2



Unnamed: 0,CustomerID,Gender,Age
2,3,Female,20.0
3,4,Female,
4,5,Female,31.0


In [27]:
# A negative position counts from the end, so -1 is the last column. Picking
# a single column position (not a slice) returns a Series.
finalCol = customers.iloc[:, -1]
print(finalCol, type(finalCol), sep="\n")

0      39
1      81
2       6
3      77
4      40
       ..
195    79
196    28
197    74
198    18
199    83
Name: Score, Length: 200, dtype: int64
<class 'pandas.core.series.Series'>


In [28]:
# A column slice that stops at -1 keeps every column except the last one.
# Because this is a slice, the result stays a DataFrame.
allExceptLast = customers.iloc[:, :-1]
print(allExceptLast, type(allExceptLast), sep="\n")

     CustomerID  Gender   Age  Income
0             1    Male  19.0      15
1             2    Male  21.0      15
2             3  Female  20.0      16
3             4  Female   NaN      16
4             5  Female  31.0      17
..          ...     ...   ...     ...
195         196  Female  35.0     120
196         197  Female  45.0     126
197         198    Male  32.0     126
198         199    Male  32.0     137
199         200    Male  30.0     137

[200 rows x 4 columns]
<class 'pandas.core.frame.DataFrame'>
