In [3]:
import pandas as pd
# Pandas Series provides more flexibility and functionality when working with labeled, heterogeneous, or incomplete datasets,
# making it more suitable for real-world data analysis tasks compared to NumPy arrays, which are more focused on 
# performance and mathematical operations.
# 2 major data structures used in pandas are Series and DataFrame
# Both Series and DataFrame are critical data structures,
# but they serve slightly different purposes and have different characteristics
# 1) Series
# 2) DataFrame


In [3]:
# Creating a Series from a List
# By default, Pandas assigns a numeric index that starts from 0, similar to Python list indexing
# Parameter	Description                                                      	                      
# 1)data ( Required)  
#                   The data to be stored in the Series. It can be various types such as a list, 
#                   NumPy array, dictionary, scalar value,or another Pandas Series.	                            
# 2)index(optional---default integer index 0,1,2...)	
#                   The labels for the Series, which are optional. It should be the same length as the data.	
# 3) dtype None (inferred)	The data type of the Series. If not specified, Pandas will infer the data type from the input.	
# 4) name	(None) A name for the Series, useful when the Series is part of a DataFrame or for labeling purposes.
# 5) copy	(False)If True, the data is copied; otherwise, it’s a reference.	False
# 6) fastpath	This is an internal parameter used to bypass certain checks. Not typically used by users.

# Create a Series from a list
data = [10, 20, 30, 40, 50]
series = pd.Series(data)

print(series) # The first column represents the index of the Series
              #  The second column shows the actual data values in the Series

0    10
1    20
2    30
3    40
4    50
dtype: int64


In [79]:
# Creating a Series with a Custom Index

s1 = pd.Series([12,-4,7,9], index=['a','b','c','d'])
print(s1)

a    12
b    -4
c     7
d     9
dtype: int64


In [9]:
index = ['A', 'B', 'C', 'D', 'E']
series = pd.Series(data, index=index)

print(series)

A    1
B    2
C    3
D    4
E    5
dtype: int64


In [13]:
# Create a Series from a scalar value
scalar_series = pd.Series(5, index=[0, 1, 2, 3, 4])
print(scalar_series)

0    5
1    5
2    5
3    5
4    5
dtype: int64


In [7]:
# Accessing elements using index ( Indexing)
# .loc[] attribute is used to access elements by their  label-based indexing (custome index)
# In a Pandas Series, the .iloc[] attribute is used to access elements by their integer location, or positional index. 

s1 = pd.Series([65,66,67,68,69], index=['A','B','C','D','E']) # Series has a custom index
print(s1)
print("s1['A'] is", s1['A']) #   s1['A'] is 65
print("s1.loc['A'] is", s1.loc['A']) #   s1.loc['A'] is 65             label based indexing


print("s1.iloc[0] is", s1.iloc[0]) #   s1.iloc[0] is 65
print("s1.iloc[-5] is", s1.iloc[-5]) #   s1.iloc[-5] is 65


# To add an element to the existing series
s1['a']=97
s1.loc['b']=98

print("The modified series is:")
print(s1)

print("s1[0] is", s1[0]) #   s1[0] is 65

A    65
B    66
C    67
D    68
E    69
dtype: int64
s1['A'] is 65
s1.loc['A'] is 65
s1.iloc[0] is 65
s1.iloc[-5] is 65
The modified series is:
A    65
B    66
C    67
D    68
E    69
a    97
b    98
dtype: int64
s1[0] is 65


  print("s1[0] is", s1[0]) #   s1[0] is 65


In [3]:
# Filtering values using boolean indexing
import pandas as pd
import numpy as np
# Create a Series
ageSeries = pd.Series([25, 30, 35, 40, 45])
print("The series ages is:")
print(ageSeries,"\n")
# Filter to find all values greater than 30

older_than_30 = ageSeries[ageSeries > 30]
print(type(older_than_30))
print( "The people older than 30 are")
print(older_than_30)

The series ages is:
0    25
1    30
2    35
3    40
4    45
dtype: int64 

<class 'pandas.core.series.Series'>
The people older than 30 are
2    35
3    40
4    45
dtype: int64


In [73]:
#  Mathematical operations
s=pd.Series([4,6,-8,9])
print("The series after operation s/2 is:")
print(s/2)
print("The series log(s) is:")
np.log(s)

The series after operation s/2 is:
0    2.0
1    3.0
2   -4.0
3    4.5
dtype: float64
The series log(s) is:


  result = getattr(ufunc, method)(*inputs, **kwargs)


0    1.386294
1    1.791759
2         NaN
3    2.197225
dtype: float64

In [75]:
# unique() function: To know all the values contained within the Series excluding duplicates 

serd = pd.Series([1,0,2,1,2,3], index=['white','white','blue','green','green','yellow'])
print("The series is:")
print(serd, "\n")
uni_values=serd.unique() ; # 
print("type(uni_values):",type(uni_values))
print(uni_values)
uni_values

The series is:
white     1
white     0
blue      2
green     1
green     2
yellow    3
dtype: int64 

type(uni_values): <class 'numpy.ndarray'>
[1 0 2 3]


array([1, 0, 2, 3], dtype=int64)

In [79]:
# value_counts( ): This function not only returns the unique values but calculates occurrences within a Series.

newSeries=serd.value_counts()
print("Val  Count")
newSeries


Val  Count


1    2
2    2
0    1
3    1
Name: count, dtype: int64

In [75]:
# isin( ) is a function evaluates the membership, that is, given a list of values, this function shows 
# if these values are contained within the data structure. Boolean values that are returned
import pandas as pd

# Create a Series
data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# Define the values to filter
values_to_check = [1, 3, 5, 7]
# Elements at positions 0, 2, 4, and 6 (1, 3, 5, 7) return True because they are in the list [1, 3, 5, 7].
# Use isin() to check which elements are in the specified list
filtered = data.isin(values_to_check)
print("The filtered list is:")
print(filtered)


The filtered list is:
0     True
1    False
2     True
3    False
4     True
5    False
6     True
7    False
8    False
9    False
dtype: bool


In [29]:
# np.NaN from the NumPy library, which stands for "Not a Number."
# It's a special floating-point value that represents missing data in numeric and non-numeric arrays alike.
# While constructing Series or DataFrame in Pandas, place np.NaN to indicate that a value is missing.
# Create a Series with explicit missing values
data = [1, 2, np.NaN, 4, 5, None]
series = pd.Series(data)
print(series)

# Pandas automatically converts integers to floats in this case, because np.NaN is a floating-point value, 
# and arrays in Pandas need to be of a uniform data type

0    1.0
1    2.0
2    NaN
3    4.0
4    5.0
5    NaN
dtype: float64


In [43]:
# Working with Missing Values
# The isnull( ) and notnull( ) functions are very useful to identify the indexes without a value.
# Use isna() or isnull() to check for missing values, which returns a Boolean array.
print("The series is")
print(series)
# Check for missing values
print("\nReturn type of isnull(), i.e., type(series.isnull()):",type(series.isnull())) #  bool Series type
print("\nShow missing values using series.isnull()")
bool_series_missing_values=series.isnull()
print(bool_series_missing_values)

print("\nnotnull() output")
print(series.notnull())

# Fill missing values with a specified value
print("\nfilled_series after filling 0 to missing values using filled_series = series.fillna(0)")
filled_series = series.fillna(0)
print(filled_series)


# Use dropna() to remove rows or columns containing missing values
clean_series = series.dropna()
print("clean_series after removing missing values using clean_series = series.dropna()")
print(clean_series)



The series is
0    1.0
1    2.0
2    NaN
3    4.0
4    5.0
5    NaN
dtype: float64

Return type of isnull(), i.e., type(series.isnull()): <class 'pandas.core.series.Series'>

Show missing values using series.isnull()
0    False
1    False
2     True
3    False
4    False
5     True
dtype: bool

notnull() output
0     True
1     True
2    False
3     True
4     True
5    False
dtype: bool

filled_series after filling 0 to missing values using filled_series = series.fillna(0)
0    1.0
1    2.0
2    0.0
3    4.0
4    5.0
5    0.0
dtype: float64
clean_series after removing missing values using clean_series = series.dropna()
0    1.0
1    2.0
3    4.0
4    5.0
dtype: float64


In [97]:
# Defining Series from NumPy Arrays 
import numpy as np
arr = np.array([1,2,3,4])
s3 = pd.Series(arr)
print(s3)

0    1
1    2
2    3
3    4
dtype: int32


In [45]:
# Creating a Series from a Dictionary
#the array of index is filled with the values of the keys while the data with the corresponding values. 

data_dict = {'a': 1, 'b': 2, 'c': 3, 'd':4}
series = pd.Series(data_dict)
print("The series is:")
print(series)

print("\nSeries.values=",series.values)
print("Series.index=",series.index)
print("The value of series['a'] is",series['a'])
print("series[0:2]=")
print(series[0:2])


The series is:
a    1
b    2
c    3
d    4
dtype: int64

Series.values= [1 2 3 4]
Series.index= Index(['a', 'b', 'c', 'd'], dtype='object')
The value of series['a'] is 1
series[0:2]=
a    1
b    2
dtype: int64


In [115]:
mydict = {'red': 2000, 'blue': 1000, 'yellow': 500, 'orange': 1000}
myseries1 = pd.Series(mydict)
print(myseries1)

red       2000
blue      1000
yellow     500
orange    1000
dtype: int64


In [121]:
colors = ['red','yellow','orange','blue','green','violet']
myseries2 = pd.Series(mydict, index=colors)
print(myseries2)

red       2000.0
yellow     500.0
orange    1000.0
blue      1000.0
green        NaN
violet       NaN
dtype: float64


In [9]:
# opearions between series
import pandas as pd

# Create two Series
s1 = pd.Series([1, 2, 3, 4, 5,6])
s2 = pd.Series([10, 20, 30, 40, 50])

# Arithmetic operations  ( element by element operaion)
add = s1 + s2
sub = s1 - s2
mul = s1 * s2
div = s2 / s1

print("Addition:\n", add)
print("Subtraction:\n", sub)
print("Multiplication:\n", mul)
print("Division:\n", div)


Addition:
 0    11.0
1    22.0
2    33.0
3    44.0
4    55.0
5     NaN
dtype: float64
Subtraction:
 0    -9.0
1   -18.0
2   -27.0
3   -36.0
4   -45.0
5     NaN
dtype: float64
Multiplication:
 0     10.0
1     40.0
2     90.0
3    160.0
4    250.0
5      NaN
dtype: float64
Division:
 0    10.0
1    10.0
2    10.0
3    10.0
4    10.0
5     NaN
dtype: float64


In [11]:
# Broadcasting Operations ,Scalar multiplication
print(s1)
scalar_prod = s1 * 2
print("Scalar Multiplication s1 * 2:\n", scalar_prod)

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64
Scalar Multiplication s1 * 2:
 0     2
1     4
2     6
3     8
4    10
5    12
dtype: int64


In [123]:
# Operations between Series
# Handling Mismatched Indexes
mydict1= {'red': 2000, 'blue': 1000, 'yellow': 500, 'orange': 1000}
s1 = pd.Series(mydict1)

mydict2 = {'red':400,'yellow':1000,'black':700}
s2 = pd.Series(mydict2)
s1 + s2

black        NaN
blue         NaN
orange       NaN
red       2400.0
yellow    1500.0
dtype: float64

In [127]:
# Statistical operations
print("Sum:", s1.sum())
print("Mean:", s1.mean())
print("Median:", s1.median())
print("Standard Deviation:", s1.std())


Sum: 15
Mean: 3.0
Median: 3.0
Standard Deviation: 1.5811388300841898


In [129]:
# Handling missing data
s_with_nan = pd.Series([1, 2, np.nan, 4, np.nan])
print("Original Series with NaN:\n", s_with_nan)
print("Is NaN:\n", s_with_nan.isna())
print("Filled NaN with zero:\n", s_with_nan.fillna(0))


Original Series with NaN:
 0    1.0
1    2.0
2    NaN
3    4.0
4    NaN
dtype: float64
Is NaN:
 0    False
1    False
2     True
3    False
4     True
dtype: bool
Filled NaN with zero:
 0    1.0
1    2.0
2    0.0
3    4.0
4    0.0
dtype: float64


In [13]:
# Other operations
s1 = pd.Series([1, 2, 3, 4, 5,6])
s2 = pd.Series([10, 20, 30, 40, 50, 1])

concatenated = pd.concat([s1, s2])
print("Concatenated Series:\n", concatenated)
print("Concatenated.iloc[0] is",concatenated.iloc[0])
print("Concatenated.iloc[6] is",concatenated.iloc[6])
print("Sorted Series:\n", concatenated.sort_values(ascending=False))
print("Unique Values in S1:", concatenated.unique())
print("Value Counts in S1:\n", concatenated.value_counts())


Concatenated Series:
 0     1
1     2
2     3
3     4
4     5
5     6
0    10
1    20
2    30
3    40
4    50
5     1
dtype: int64
Concatenated.iloc[0] is 1
Concatenated.iloc[6] is 10
Sorted Series:
 4    50
3    40
2    30
1    20
0    10
5     6
4     5
3     4
2     3
1     2
0     1
5     1
dtype: int64
Unique Values in S1: [ 1  2  3  4  5  6 10 20 30 40 50]
Value Counts in S1:
 1     2
2     1
3     1
4     1
5     1
6     1
10    1
20    1
30    1
40    1
50    1
Name: count, dtype: int64


In [117]:
# Custom Functions with apply

s1=pd.Series([4,1,2,3])
# Custom operation
def custom_op(value):
    return value * 2 + 3

custom_series = s1.apply(custom_op)
print("Custom Operation with apply():\n", custom_series)


Custom Operation with apply():
 0    11
1     5
2     7
3     9
dtype: int64


In [121]:
# Logical operations
s1=pd.Series([4,1,2,3])
greater_than_two = s1 > 2
print("Greater than 2:\n", greater_than_two)


Greater than 2:
 0     True
1    False
2    False
3     True
dtype: bool


In [285]:
#  Creating DataFrame From a Dictionary of Lists

data = {'color' :  ['blue','green','yellow','red','white'],
        'object' : ['ball','pen','pencil','paper','paper'],
        'price' :  [1.2,    1.0,    0.6,    0.9,   1.7]
       }

frame1 = pd.DataFrame(data)
print(frame1)

    color  object  price
0    blue    ball    1.2
1   green     pen    1.0
2  yellow  pencil    0.6
3     red   paper    0.9
4   white   paper    1.7


In [289]:
frame2 = pd.DataFrame(data, index=['zero','one','two','three','four'])
print(frame2, "\n")
# Selecting Elements
print("frame2.index:")
print(frame2.index, "\n")
print("frame2.columns:")
print(frame2.columns, "\n")
print("frame2.values:")
print(frame2.values , "\n")
# display a column
print("Only price column: using frame2['price']")
print(frame2['price'], "\n")     

print(frame2, "\n")

# display a row
print("Row 2: frame2.iloc[2] is") 
print(frame2.iloc[2], "\n")   

print("Row 2: frame2.loc['two'] is") 
print(frame2.loc['two'], "\n")

# Access the row where object is "paper"

print("Access the row(s) where object is 'paper'using ---frame2[frame2['object'] == 'paper']")
print(frame2[frame2['object'] == 'paper'],"\n")

# if you know the index label and column:
print("frame2.at['three', 'object'] is", frame2.at['three', 'object'], "\n")


print("frame2['object']['three'] is", frame2['object']['three'], "\n")
# print("frame2['object'][3] is ",frame2['object'][3], "\n")

print("frame2[0:1]:")
print(frame2[0:1], "\n")

print("frame2[1:3]:")
print(frame2[1:3], "\n")


        color  object  price
zero     blue    ball    1.2
one     green     pen    1.0
two    yellow  pencil    0.6
three     red   paper    0.9
four    white   paper    1.7 

frame2.index:
Index(['zero', 'one', 'two', 'three', 'four'], dtype='object') 

frame2.columns:
Index(['color', 'object', 'price'], dtype='object') 

frame2.values:
[['blue' 'ball' 1.2]
 ['green' 'pen' 1.0]
 ['yellow' 'pencil' 0.6]
 ['red' 'paper' 0.9]
 ['white' 'paper' 1.7]] 

Only price column: using frame2['price']
zero     1.2
one      1.0
two      0.6
three    0.9
four     1.7
Name: price, dtype: float64 

        color  object  price
zero     blue    ball    1.2
one     green     pen    1.0
two    yellow  pencil    0.6
three     red   paper    0.9
four    white   paper    1.7 

Row 2: frame2.iloc[2] is
color     yellow
object    pencil
price        0.6
Name: two, dtype: object 

Row 2: frame2.loc['two'] is
color     yellow
object    pencil
price        0.6
Name: two, dtype: object 

Access the row(s) where o

In [201]:
print("frame2  before:")
print(frame2, "\n")
frame2.index.name = 'id'; frame2.columns.name = 'item'
print("frame2  after:")
print(frame2, "\n")

frame2  before:
        color  object  price
zero     blue    ball    1.2
one     green     pen    1.0
two    yellow  pencil    0.6
three     red   paper    0.9
four    white     mug    1.7 

frame2  after:
item    color  object  price
id                          
zero     blue    ball    1.2
one     green     pen    1.0
two    yellow  pencil    0.6
three     red   paper    0.9
four    white     mug    1.7 



In [189]:
# To add a new column- assigning a value to the instance of the DataFrame specifying a new column name.
# Adding a new column 'quantity' with values
frame2['quantity'] = [10, 20, 30, 40, 50]
print("frame2  after adding a new column 'quantity'")
print(frame2, "\n")


frame2  after adding a new column 'quantity'
item    color  object  price  quantity
id                                    
zero     blue    ball    1.2        10
one     green     pen    1.0        20
two    yellow  pencil    0.6        30
three     red   paper    0.9        40
four    white     mug    1.7        50 



In [195]:
# Adding a Column with a Scalar Value:
# Adding a new column 'discount' with the same value for all rows
print("frame2  after adding a new column 'discount with same value(0.1) for all the rows'")
frame2['discount'] = 0.1
print(frame2)

frame2  after adding a new column 'discount with same value(0.1) for all the rows'
item    color  object  price  quantity  discount
id                                              
zero     blue    ball    1.2        10       0.1
one     green     pen    1.0        20       0.1
two    yellow  pencil    0.6        30       0.1
three     red   paper    0.9        40       0.1
four    white     mug    1.7        50       0.1


In [197]:
# Adding a Column 'total' with Values Based on Existing Columns ( price and quantity):
frame2['total'] = frame2['price'] * frame2['quantity']
print(frame2)

item    color  object  price  quantity  discount  total
id                                                     
zero     blue    ball    1.2        10       0.1   12.0
one     green     pen    1.0        20       0.1   20.0
two    yellow  pencil    0.6        30       0.1   18.0
three     red   paper    0.9        40       0.1   36.0
four    white     mug    1.7        50       0.1   85.0


In [370]:
# create a series and add it as a column to the dataframe
#ser = pd.Series(np.arange(5)) # This will add a column with  all NAN because of index mismatch
ser = pd.Series(np.arange(5), index=['zero','one','two','three','four'])
print("The series is:")
print(ser)
frame2['newCol'] = ser 
print(frame2)

The series is:
zero     0
one      1
two      2
three    3
four     4
dtype: int32
        mug  pen  ball  newCol
blue      0    1     2     NaN
green     3    4     5     NaN
white     6    7     8     NaN
yellow    9   10    11     NaN


In [229]:
# creating DataFrame from numpy array
ind=['red','blue','yellow','white']
cols=['ball','pen','pencil','paper']
frame3 = pd.DataFrame(np.arange(16).reshape((4,4)),index=ind, columns=cols)
print(frame3)

        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15


In [227]:
# Membership of a Value using isin()
# The isin() function in Pandas is used to filter a DataFrame by checking whether
# each element in a column (or multiple columns) is present in a given list of values.
# It returns a Boolean DataFrame or Series that can be used to filter rows.
print(frame2, "\n")
print("frame2.isin([0.6,'pen',1]):")
print(frame2.isin([0.6,'pen',1]))

# If you pass the value returned as a condition then you’ll get a new DataFrame 
# containing only the values that satisfy the condition.
print("frame2[frame2.isin([0.6,'pen',1])]:")
frame2[frame2.isin([0.6,'pen',1])]


item    color  object  price  newCol
id                                  
zero     blue    ball    1.2       0
one     green     pen    1.0       1
two    yellow  pencil    0.6       2
three     red   paper    0.9       3
four    white     mug    1.7       4 

item   color  object  price  newCol
id                                 
zero   False   False  False   False
one    False    True   True    True
two    False   False   True   False
three  False   False  False   False
four   False   False  False   False


item,color,object,price,newCol
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
zero,,,,
one,,pen,1.0,1.0
two,,,0.6,
three,,,,
four,,,,


In [231]:
# Deleting a column using del
del frame2['newCol']
print("frame2 after deleting a column newCol\n")
print(frame2, "\n")

frame2 after deleting a column newCol

item    color  object  price
id                          
zero     blue    ball    1.2
one     green     pen    1.0
two    yellow  pencil    0.6
three     red   paper    0.9
four    white     mug    1.7 



In [239]:
# Filtering: apply the filtering through the application of certain conditions
frame=pd.DataFrame(np.arange(16).reshape(4,4), columns=['c0','c1','c2','c3'])
print(frame, "\n")

frame[frame < 12]

   c0  c1  c2  c3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15 



Unnamed: 0,c0,c1,c2,c3
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0
3,,,,


In [293]:
# DataFrame from Nested dict
# what if not all fields find a successful match,
# pandas will compensate for this inconsistency by adding the value NaN values missing. 
# consider external keys as column names and internal keys as labels for the indexes.
nestdict = {'red':{2012: 22, 2013: 33, 2014:44},'white':{2011: 13, 2012: 22, 2013: 16},'blue': {2011: 17, 2012: 27, 2013: 18}}
fr = pd.DataFrame(nestdict)
fr 

Unnamed: 0,red,white,blue
2012,22.0,22.0,27.0
2013,33.0,16.0,18.0
2014,44.0,,
2011,,13.0,17.0


In [257]:
# Transposition of a DataFrame (the columns become rows and rows become columns)
fr.T

Unnamed: 0,2012,2013,2014,2011
red,22.0,33.0,44.0,
white,22.0,16.0,,13.0
blue,27.0,18.0,,17.0


In [261]:
# The Index Objects: presence of an Index object totally integrated within  Series and the data frame data structures.
# Unlike all other elements within pandas data structures (Series and data frame), the Index objects are immutable objects. 
# Once declared, these cannot be changed. This ensures their secure sharing between the various data structures.
ser = pd.Series([5,0,3,8,4], index=['red','blue','yellow','white','green'])
print(ser.index)
# two functions that return, respectively, the index with the lowest value and more.
print(ser.idxmin()) # blue
print(ser.idxmax()) # white

Index(['red', 'blue', 'yellow', 'white', 'green'], dtype='object')
blue
white


In [295]:
# Index with Duplicate Labels
serd = pd.Series(range(6), index=['white','white','blue','green','green','yellow'])
print(serd)
print(serd['white'])
print("whether serd.index.is_unique is unique? found using serd.index.is_unique",serd.index.is_unique)  # False
frame.index.is_unique # True

white     0
white     1
blue      2
green     3
green     4
yellow    5
dtype: int64
white    0
white    1
dtype: int64
whether serd.index.is_unique is unique? found using serd.index.is_unique False


True

In [279]:
# Other Functionalities on Indexes ( Reindexing)
ser = pd.Series([2,5,7,4], index=['one','two','three','four'])
print(ser, "\n")
print(ser.reindex(['three','four','five','one']), "\n")
print(ser)

one      2
two      5
three    7
four     4
dtype: int64 

three    7.0
four     4.0
five     NaN
one      2.0
dtype: float64 

one      2
two      5
three    7
four     4
dtype: int64


In [35]:
# From a List of Dictionaries

# List of dictionaries
data_list_dict = [
    {'ID': 1, 'Name': 'Alice', 'Age': 24},
    {'ID': 2, 'Name': 'Bob', 'Age': 27},
    {'ID': 3, 'Name': 'Chris', 'Age': 22}
]

# Create DataFrame
df_list_dict = pd.DataFrame(data_list_dict)
print(df_list_dict)



   ID   Name  Age
0   1  Alice   24
1   2    Bob   27
2   3  Chris   22


In [337]:
# dropping: Another operation that is connected to Index objects is dropping
ser = pd.Series(np.arange(4.), index=['red','blue','yellow','white'])
ser

red       0.0
blue      1.0
yellow    2.0
white     3.0
dtype: float64

In [339]:
ser.drop('yellow')

red      0.0
blue     1.0
white    3.0
dtype: float64

In [341]:
print("The series is")
print(ser,"\n")
print("After dropping blue and white")
ser.drop(['blue','white'])

The series is
red       0.0
blue      1.0
yellow    2.0
white     3.0
dtype: float64 

After dropping blue and white


red       0.0
yellow    2.0
dtype: float64

In [13]:
# use drop() on dataframe
import numpy as np
frame = pd.DataFrame(np.arange(16).reshape((4,4)),index=['red','blue','yellow','white'], columns=['ball','pen','pencil','paper'])
print(frame)
print("\nframe after dropping 2 rows blue and yellow using frame.drop(['blue','yellow'])")
print(frame.drop(['blue','yellow']))

# To delete columns, you always need to specify the indexes of the columns, but you must specify the
# axis from which to delete the elements

print("\nframe after dropping 2 columns pen and pencil using frame.drop(['pen','pencil'],axis=1)")
frame.drop(['pen','pencil'],axis=1) # So to refer to the column names you should specify axis = 1.


        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15

frame after dropping 2 rows blue and yellow using frame.drop(['blue','yellow'])
       ball  pen  pencil  paper
red       0    1       2      3
white    12   13      14     15

frame after dropping 2 columns pen and pencil using frame.drop(['pen','pencil'],axis=1)


Unnamed: 0,ball,paper
red,0,3
blue,4,7
yellow,8,11
white,12,15


In [321]:
# Arithmetic and Data Alignment
# some labels are present in both, while other labels are present only in one of the two.
# When the labels are present in both operators, their values will be added, while in the opposite case,
# they will also be shown in the result (new series), but with the value NaN.
s1 = pd.Series([3,2,5,1],['white','yellow','green','blue'])
s2 = pd.Series([1,4,7,2,1],['white','yellow','black','blue','brown'])
s1 + s2

black     NaN
blue      3.0
brown     NaN
green     NaN
white     4.0
yellow    6.0
dtype: float64

In [327]:
frame1 = pd.DataFrame(np.arange(16).reshape((4,4)),index=['red','blue','yellow','white'],columns=['ball','pen','pencil','paper'])
frame2 = pd.DataFrame(np.arange(12).reshape((4,3)), index=['blue','green','white','yellow'],columns=['mug','pen','ball'])
print(frame1)
print(frame2)
frame1 + frame2

        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15
        mug  pen  ball
blue      0    1     2
green     3    4     5
white     6    7     8
yellow    9   10    11


Unnamed: 0,ball,mug,paper,pen,pencil
blue,6.0,,,6.0,
green,,,,,
red,,,,,
white,20.0,,,20.0,
yellow,19.0,,,19.0,


In [329]:
# Operations between Data Structures, Flexible Arithmetic Methods
frame1.add(frame2)

Unnamed: 0,ball,mug,paper,pen,pencil
blue,6.0,,,6.0,
green,,,,,
red,,,,,
white,20.0,,,20.0,
yellow,19.0,,,19.0,


In [11]:
# Operations between DataFrame and Series
import pandas as pd
import numpy as np
frame = pd.DataFrame(np.arange(16).reshape((4,4)),index=['red','blue','yellow','white'],columns=['ball','pen','pencil','paper'])
print(frame)
print("\n")
ser = pd.Series(np.arange(4), index=['ball','pen','pencil','paper'])
print(ser)
# the elements of the series are subtracted from the values of the data frame corresponding to the same index on the column
# The value is subtracted for all values of the column,regardless of their index.
frame - ser

        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15


ball      0
pen       1
pencil    2
paper     3
dtype: int32


Unnamed: 0,ball,pen,pencil,paper
red,0,0,0,0
blue,4,4,4,4
yellow,8,8,8,8
white,12,12,12,12


In [360]:
ser['mug'] = 9
ser

ball      0
pen       1
pencil    2
paper     3
mug       9
dtype: int32

In [364]:
frame - ser

Unnamed: 0,ball,mug,paper,pen,pencil
red,0,,0,0,0
blue,4,,4,4,4
yellow,8,,8,8,8
white,12,,12,12,12


In [374]:
# Function Application and Mapping
print(frame)
np.sqrt(frame)

        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15


Unnamed: 0,ball,pen,pencil,paper
red,0.0,1.0,1.414214,1.732051
blue,2.0,2.236068,2.44949,2.645751
yellow,2.828427,3.0,3.162278,3.316625
white,3.464102,3.605551,3.741657,3.872983


In [31]:
# Functions by Row or Column
#define a lambda function that calculates the range covered by the elements in an array
frame=pd.DataFrame(np.array([[63,7,8,6],[6,17,45,2],[53,1,92,3],[66,23,92,37]]), index=['red','blue','yellow', 'white'], columns=['ball','pen','pencil','paper'])
print(frame)
#f = lambda x: x.max() - x.min()

def f(x):
    return x.max() - x.min()

# Using the apply( ) function you can apply the function just defined on the DataFrame.
print("\napply() function applied on columns using print(frame.apply(f))");
print(frame.apply(f))
# The result, however, this time it is only one value for the column, but if you prefer to apply the function
# by row instead of by column, you have to specify the axis option set to 1.
print("\napply() function applied on rows using print(frame.apply(f,axis=1))");
frame.apply(f, axis=1)


        ball  pen  pencil  paper
red       63    7       8      6
blue       6   17      45      2
yellow    53    1      92      3
white     66   23      92     37

apply() function applied on columns using print(frame.apply(f))
ball      60
pen       22
pencil    84
paper     35
dtype: int64

apply() function applied on rows using print(frame.apply(f,axis=1))


red       57
blue      43
yellow    91
white     69
dtype: int64

In [25]:
# apply( )  can also return a Series (applied on column
def f(x):
    return pd.Series([x.min(), x.max()], index=['min','max'])
frame.apply(f)

Unnamed: 0,ball,pen,pencil,paper
min,0,1,2,3
max,12,13,14,15


In [400]:
print(frame,"\n")
print("Sum of columns:")
print(frame.sum(),"\n")
print("Mean of columns:")
print(frame.mean(),"\n")
frame.describe( ) # allows to obtain a summary statistics at once.

        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15 

Sum of columns:
ball      24
pen       28
pencil    32
paper     36
dtype: int64 

Mean of columns:
ball      6.0
pen       7.0
pencil    8.0
paper     9.0
dtype: float64 



Unnamed: 0,ball,pen,pencil,paper
count,4.0,4.0,4.0,4.0
mean,6.0,7.0,8.0,9.0
std,5.163978,5.163978,5.163978,5.163978
min,0.0,1.0,2.0,3.0
25%,3.0,4.0,5.0,6.0
50%,6.0,7.0,8.0,9.0
75%,9.0,10.0,11.0,12.0
max,12.0,13.0,14.0,15.0


In [33]:
import pandas as pd

# Example DataFrame
df = pd.DataFrame({'ball': [0, 4, 8, 12]})

# Using different interpolation methods to see the effects
print(df['ball'].quantile(0.25, interpolation='linear'))
print(df['ball'].quantile(0.75, interpolation='linear'))


3.0
9.0


In [17]:
frame = pd.DataFrame(np.array([[11,22,33,44],[3,8,2,9],[1,2,3,4],[1,45,21,49]]),index=['red','blue','yellow','white'], columns=['pen','pencil','paper','ball'])
print("frame")
print(frame,"\n")
print("frame after frame.sort_index():, sorts inde")
print(frame.sort_index(),"\n")
print("frame after frame.sort_index() with axis=1")
frame.sort_index(axis=1)


frame
        pen  pencil  paper  ball
red      11      22     33    44
blue      3       8      2     9
yellow    1       2      3     4
white     1      45     21    49 

frame after frame.sort_index():, sorts index
        pen  pencil  paper  ball
blue      3       8      2     9
red      11      22     33    44
white     1      45     21    49
yellow    1       2      3     4 

frame after frame.sort_index() with axis=1


Unnamed: 0,ball,paper,pen,pencil
red,44,33,11,22
blue,9,2,3,8
yellow,4,3,1,2
white,49,21,1,45


In [40]:
#If you need to order the values in a DataFrame, you will use the sort_index( ) function seen previously but with the by option.
import pandas as pd

# Example DataFrame
frame = pd.DataFrame({
    'ball': [0, 4, 8, 12],
    'pen': [5, 1, 13, 9],
    'pencil': [2, 6, 10, 14],
    'paper': [3, 7, 11, 15]
})
print(frame,"\n")

# Sort DataFrame by the 'pen' column
print("DataFrame after sort by the 'pen' column:")
sorted_frame = frame.sort_values(by='pen')

print(sorted_frame)


   ball  pen  pencil  paper
0     0    5       2      3
1     4    1       6      7
2     8   13      10     11
3    12    9      14     15 

DataFrame after sort by the 'pen' column:
   ball  pen  pencil  paper
1     4    1       6      7
0     0    5       2      3
3    12    9      14     15
2     8   13      10     11


In [42]:
#sorting will be based on two or more columns
import pandas as pd

import pandas as pd

# Create a DataFrame with some tied 'pen' values
frame = pd.DataFrame({
    'ball': [12, 0, 8, 4],
    'pen': [13, 13, 5, 5],
    'pencil': [14, 10, 6, 12],
    'paper': [15, 3, 11, 7]
})

print(frame)
print(frame.sort_values(by='pen'),"\n")
# Primary Sort Key: 'pen' (Sort rows first based on this column)
frame.sort_values(by=['pen','pencil'])

   ball  pen  pencil  paper
0    12   13      14     15
1     0   13      10      3
2     8    5       6     11
3     4    5      12      7
   ball  pen  pencil  paper
2     8    5       6     11
3     4    5      12      7
0    12   13      14     15
1     0   13      10      3 



Unnamed: 0,ball,pen,pencil,paper
2,8,5,6,11
3,4,5,12,7
1,0,13,10,3
0,12,13,14,15


In [44]:
# Pandas can directly create a DataFrame from a CSV file using read_csv() method.

# Assuming 'data.csv' is in the current directory with the appropriate columns
df_csv = pd.read_csv('data.csv')
print(df_csv)

   id  value1  value2  value3
0   1     123     1.4      23
1   2     110     0.5      18
2   3     164     2.1      19


In [46]:
# Ranking
ser = pd.Series([5,0,3,8,4], index=['red','blue','yellow','white','green'])
print(ser,"\n")
print("series after ser.sort_values()")
# ser.order() deprecated
print(ser.sort_values(),"\n")
print("series after ser.rank() ")
print(ser.rank())
print("series after ser.rank(method='first') ")
ser.rank(method='first')


red       5
blue      0
yellow    3
white     8
green     4
dtype: int64 

series after ser.sort_values()
blue      0
yellow    3
green     4
red       5
white     8
dtype: int64 

series after ser.rank() 
red       4.0
blue      1.0
yellow    2.0
white     5.0
green     3.0
dtype: float64
series after ser.rank(method='first') 


red       4.0
blue      1.0
yellow    2.0
white     5.0
green     3.0
dtype: float64

In [90]:
import pandas as pd
# When you call the rank() method without specifying a method, Pandas defaults to method='average'
ser = pd.Series([7, 5, 5, 8]) # the order of 5 is 1 and next 5 is 2  , the avg is (1+2)/2=1.5
print(ser,"\n")
print("series after ser.rank()\n")
ranks = ser.rank()
print(ranks)
# Ties are broken based on the order in which they appear in the data.
print("series after ser.rank(method='first') \n")
print(ser.rank(method='first'),"\n")

# By default, even the ranking follows an ascending sort. To reverse this criterion, set the ascending option to False.
print(" Series after ser.rank(ascending=False)")
print(ser.rank(ascending=False))
print(" Series after ser.rank(ascending=False, method='first')")
print(ser.rank(ascending=False,method='first'))


0    7
1    5
2    5
3    8
dtype: int64 

series after ser.rank()

0    3.0
1    1.5
2    1.5
3    4.0
dtype: float64
series after ser.rank(method='first') 

0    3.0
1    1.0
2    2.0
3    4.0
dtype: float64 

 Series after ser.rank(ascending=False)
0    2.0
1    3.5
2    3.5
3    1.0
dtype: float64
 Series after ser.rank(ascending=False, method='first')
0    2.0
1    3.0
2    4.0
3    1.0
dtype: float64


In [100]:
#  Correlation and Covariance
# Correlation values range between -1 and +1:
# Correlation: indicates how two variables behave together—that is, whether they tend to increase together, decrease together, or show no pattern 
seq2 = pd.Series([3,4,3,4,5,4,3,2],['2006','2007','2008','2009','2010','2011','2012','2013'])
seq = pd.Series([1,2,3,4,4,3,2,1],['2006','2007','2008','2009','2010','2011','2012','2013'])
print("Correlation: ",seq.corr(seq2)) 

print("Covariance:  ",seq.cov(seq2)  )

Correlation:  0.7745966692414835
Covariance:   0.8571428571428571


In [112]:
# covariance and correlation are applied to a single DataFrame
import pandas as pd
frame2 = pd.DataFrame([[1,4,3,6],[4,5,6,1],[3,3,1,5],[4,1,6,4]], index=['red','blue','yellow','white'],columns=['ball','pen','pencil','paper'])
print(frame2)
print(frame2.corr())
frame2.cov()


        ball  pen  pencil  paper
red        1    4       3      6
blue       4    5       6      1
yellow     3    3       1      5
white      4    1       6      4
            ball       pen    pencil     paper
ball    1.000000 -0.276026  0.577350 -0.763763
pen    -0.276026  1.000000 -0.079682 -0.361403
pencil  0.577350 -0.079682  1.000000 -0.692935
paper  -0.763763 -0.361403 -0.692935  1.000000


Unnamed: 0,ball,pen,pencil,paper
ball,2.0,-0.666667,2.0,-2.333333
pen,-0.666667,2.916667,-0.333333,-1.333333
pencil,2.0,-0.333333,6.0,-3.666667
paper,-2.333333,-1.333333,-3.666667,4.666667


In [90]:
# Hierarchical Indexing and Leveling

mser = pd.Series(np.random.rand(8), index=[['white','white','white','blue','blue','red','red','red'],
['up','down','right','up','down','up','down','left']])
print(mser,"\n")

# Accessing the value at (white, up)

value = mser['white', 'up']
print("Accessing the value at (white, up) using mser['white', 'up']",value,"\n")

# Slicing all 'white' entries
white_values = mser['white']

# Cross-section access at a lower level across multiple upper levels
print("fetch all 'down' entries across 'white', 'blue', 'red' using mser.loc[:, 'down']")
down_values = mser.loc[:, 'down']  # This will fetch all 'down' entries across 'white', 'blue', 'red'
print("\nmser.index")
print(mser.index,"\n")
print("\nmser.index.levels")
print(mser.index.levels,"\n")
print("\nmser.index.codes: This gives the integer codes that represent the position of each label in the corresponding level")
print(mser.index.codes,"\n")
print("mser.index.get_level_values(0)")
print(mser.index.get_level_values(0))

white  up       0.311611
       down     0.929862
       right    0.365807
blue   up       0.750327
       down     0.256312
red    up       0.812639
       down     0.901609
       left     0.615956
dtype: float64 

Accessing the value at (white, up) using mser['white', 'up'] 0.31161070027170745 

fetch all 'down' entries across 'white', 'blue', 'red' using mser.loc[:, 'down']

mser.index
MultiIndex([('white',    'up'),
            ('white',  'down'),
            ('white', 'right'),
            ( 'blue',    'up'),
            ( 'blue',  'down'),
            (  'red',    'up'),
            (  'red',  'down'),
            (  'red',  'left')],
           ) 


mser.index.levels
[['blue', 'red', 'white'], ['down', 'left', 'right', 'up']] 


mser.index.codes: This gives the integer codes that represent the position of each label in the corresponding level
[[2, 2, 2, 0, 0, 1, 1, 1], [3, 0, 2, 3, 0, 3, 0, 1]] 

mser.index.get_level_values(0)
Index(['white', 'white', 'white', 'blue', 'blue', '

In [92]:
# It is possible to define a hierarchical index both for the rows and for the columns
mframe = pd.DataFrame(np.random.randn(16).reshape(4,4),index=[['white','white','red','red'], ['up','down','up','down']],
columns=[['pen','pen','paper','paper'],[1,2,1,2]])
mframe 

Unnamed: 0_level_0,Unnamed: 1_level_0,pen,pen,paper,paper
Unnamed: 0_level_1,Unnamed: 1_level_1,1,2,1,2
white,up,0.7306,0.867602,-0.243466,1.684347
white,down,1.103726,-0.989485,1.021561,-0.161414
red,up,0.980246,-1.005305,1.51169,1.271966
red,down,-2.05068,-0.641727,0.052567,3.095284


In [None]:
mframe.swaplevel('colors','status')

In [94]:
# Reordering and Sorting Levels: The swaplevel( ) function accepts as argument the names assigned to the two levels that you want to
# interchange, and returns a new object with the two levels interchanged between them, while leaving the data unmodified.
mframe.columns.names = ['objects','id']
mframe.index.names = ['colors','status']
mframe

Unnamed: 0_level_0,objects,pen,pen,paper,paper
Unnamed: 0_level_1,id,1,2,1,2
colors,status,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
white,up,0.7306,0.867602,-0.243466,1.684347
white,down,1.103726,-0.989485,1.021561,-0.161414
red,up,0.980246,-1.005305,1.51169,1.271966
red,down,-2.05068,-0.641727,0.052567,3.095284


In [54]:
# series.unstack():converts the Series with hierarchical index in a simple DataFrame,
#where the second set of indexes is converted into a new set of columns.

mser = pd.Series(np.random.rand(8), index=[['white','white','white','blue','blue','red','red','red'],['up','down','right','up','down','up','down','left']])
print(mser)
print("Output of mser.unstack(): converts the Series with hierarchical index in a simple DataFrame")
print(mser.unstack())


white  up       0.299859
       down     0.410532
       right    0.204315
blue   up       0.103503
       down     0.902083
red    up       0.521199
       down     0.125277
       left     0.181494
dtype: float64
Output of mser.unstack(): converts the Series with hierarchical index in a simple DataFrame
           down      left     right        up
blue   0.902083       NaN       NaN  0.103503
red    0.125277  0.181494       NaN  0.521199
white  0.410532       NaN  0.204315  0.299859


In [58]:
# Convert a DataFrame in to a  Series

# Create a DataFrame with some tied 'pen' values

frame = pd.DataFrame(np.arange(16).reshape((4,4)), index=['red','blue','yellow','white'],columns=['ball','pen','pencil','paper'])

print(frame)
frame.stack()   

        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15


red     ball       0
        pen        1
        pencil     2
        paper      3
blue    ball       4
        pen        5
        pencil     6
        paper      7
yellow  ball       8
        pen        9
        pencil    10
        paper     11
white   ball      12
        pen       13
        pencil    14
        paper     15
dtype: int32

In [106]:
#Reordering and Sorting Levels
# The swaplevel( ) function accepts as argument the names assigned to the two levels that you want to
# interchange, and returns a new object with the two levels interchanged between them, while leaving the data unmodified.

mframe = pd.DataFrame(np.random.randn(16).reshape(4,4),  index=[['white','white','red','red'], ['up','down','up','down']],
columns=[['pen','pen','paper','paper'],[1,2,1,2]])
print(mframe)
mframe.columns.names = ['objects','id']
mframe.index.names = ['colors','status']
print(mframe)
print(mframe.swaplevel('colors','status'))   
# mframe.sortlevel('colors') #deprecated


                 pen               paper          
                   1         2         1         2
white up   -0.333599 -0.912782  0.686864  0.525352
      down -0.154492 -0.457602 -2.057285  0.178246
red   up   -1.383538  0.625260  0.288143 -0.001251
      down  0.815578 -0.538156  0.566645 -0.776186
objects             pen               paper          
id                    1         2         1         2
colors status                                        
white  up     -0.333599 -0.912782  0.686864  0.525352
       down   -0.154492 -0.457602 -2.057285  0.178246
red    up     -1.383538  0.625260  0.288143 -0.001251
       down    0.815578 -0.538156  0.566645 -0.776186
objects             pen               paper          
id                    1         2         1         2
status colors                                        
up     white  -0.333599 -0.912782  0.686864  0.525352
down   white  -0.154492 -0.457602 -2.057285  0.178246
up     red    -1.383538  0.625260  0.288143 -0

In [108]:
import pandas as pd
import numpy as np

# Sample DataFrame with a multi-level index
mframe = pd.DataFrame({
    "data": np.random.randn(6),
    "colors": ["red", "blue", "red", "blue", "red", "blue"]
}).set_index(['colors', 'data'])

# Sort by 'colors' index level
sorted_frame = mframe.sort_index(level='colors')
print(sorted_frame)


Empty DataFrame
Columns: []
Index: [(blue, 0.13932947077929486), (blue, 0.48690683471388274), (blue, 0.5685013474409936), (red, -0.1617454346588351), (red, 1.1089210640450908), (red, 1.344113513885896)]


In [140]:
# Using the method corrwith( ), you can calculate the pairwise correlations between the columns or rows
# of a data frame with a Series or another DataFrame( ).
ser = pd.Series([7, 5, 5, 8]) 
print(ser,"\n")
frame2.corrwith(ser)

0    7
1    5
2    5
3    8
dtype: int64 



ball     NaN
pen      NaN
pencil   NaN
paper    NaN
dtype: float64

In [104]:
import pandas as pd

# Example data
data = {
    'seq': [1, 2, 3, 4, 5],
    'seq2': [2, 4, 6, 8, 10]
}

df = pd.DataFrame(data)

# Calculate correlation and covariance
correlation = df['seq'].corr(df['seq2'])
covariance = df['seq'].cov(df['seq2'])

print("Correlation: ", correlation) # correlation would be 1, because seq2 is a perfect linear transformation of seq (double the values of seq)
print("Covariance:  ", covariance)
# The covariance would be positive and relatively large, reflecting the strong positive linear relationship between seq and seq2, 
# but the exact value will depend on the scaling of the variables and the sample size.


Correlation:  0.9999999999999999
Covariance:   5.0


In [281]:
# Using Random Data
import numpy as np # np is a abbreviation for NumPy library

# Random data
np.random.seed(0) # random refers to a submodule of NumPy (numpy.random) that deals with random number generation. 

# Setting a seed ensures that the sequence of random numbers generated by NumPy is reproducible
# every time you run this code, you will get the same matrix of random numbers. 
# debugging and for scenarios where consistent output is required, such as in automated tests or scientific experiments.
random_data = np.random.randn(5, 3)  # 5 rows, 3 columns

df_random = pd.DataFrame(random_data, columns=['Column1', 'Column2', 'Column3'])

print(df_random)


    Column1   Column2   Column3
0  1.764052  0.400157  0.978738
1  2.240893  1.867558 -0.977278
2  0.950088 -0.151357 -0.103219
3  0.410599  0.144044  1.454274
4  0.761038  0.121675  0.443863


In [174]:
import numpy as np

np.random.seed(0)  # Set the seed for reproducibility
random_numbers = np.random.rand(5)  # Generate 5 random numbers from a uniform distribution

print(random_numbers)


[0.5488135  0.71518937 0.60276338 0.54488318 0.4236548 ]


In [208]:
# Selecting elements
# Using [] (Bracket Notation) to access a column
import pandas as pd

# Create a DataFrame
data = {'Name': ['Alice', 'Bob', 'Chris'],
        'Age': [25, 26, 27],
        'City': ['New York', 'Los Angeles', 'Chicago']
       }
df = pd.DataFrame(data)
print(df)

print("values:______ \n",df.values)
print("Columns:______ \n",df.columns)
print("Index:_______ \n",df.index)
print("_________Access a column_____________________")
# Access the 'Name' column
print(df['City'])

    Name  Age         City
0  Alice   25     New York
1    Bob   26  Los Angeles
2  Chris   27      Chicago
values:______ 
 [['Alice' 25 'New York']
 ['Bob' 26 'Los Angeles']
 ['Chris' 27 'Chicago']]
Columns:______ 
 Index(['Name', 'Age', 'City'], dtype='object')
Index:_______ 
 RangeIndex(start=0, stop=3, step=1)
_________Access a column_____________________
0       New York
1    Los Angeles
2        Chicago
Name: City, dtype: object


In [None]:
# Since .ix[] has been removed, you should use .loc[] or .iloc[] depending on your needs:

# .loc[]: Use this for label-based indexing. It’s safe and predictable because it works with labels in the index.
# .iloc[]: This is suitable for integer-based indexing. It allows you to access elements by their integer position, 
#          making it more straightforward when dealing with positions rather than labels.

In [196]:
# Using .loc[] (Label-based): This method allows you to access a group of rows and columns by labels or a boolean array.
print("____Access a row_____________________________")
# Access the row for Bob
print(df.loc[1])
print("____Access a single value ___________________")
# Access the 'Age' of Chris using label
print(df.loc[2, 'Age'])
print("______ multiple rows and specify columns_____")
# 
print(df.loc[0:2, ['Name', 'City']])


____Access a row_____________________________
Name            Bob
Age              26
City    Los Angeles
Name: 1, dtype: object
____Access a single value ___________________
27
______ multiple rows and specify columns_____
    Name         City
0  Alice     New York
1    Bob  Los Angeles
2  Chris      Chicago


In [230]:
# Using .iloc[] (Integer position-based)

#      Name  Age         City
#   0  Alice   25     New York
#   1    Bob   26  Los Angeles
#   2  Chris   27      Chicago

# Access the second row
print(df.iloc[1])
print("_________________________________________________________________ \n")
# Access the element at the second row and second column (Age of Bob)
print(df.iloc[1, 1])
print("_________________________________________________________________ \n")
# Access a range of rows and columns
print(df.iloc[0:2, 0:2])


Name            Bob
Age              26
City    Los Angeles
Name: 1, dtype: object
_________________________________________________________________ 

26
_________________________________________________________________ 

    Name  Age
0  Alice   25
1    Bob   26


In [226]:
# Using Conditional Selection

# Select rows where Age is greater than 25
print(df[df['Age'] > 25])
print("_________________________________________________________________ \n")
# Multiple conditions
print(df[(df['Age'] > 25) & (df['City'] == 'Los Angeles')])


    Name  Age         City
1    Bob   26  Los Angeles
2  Chris   27      Chicago
_________________________________________________________________ 

  Name  Age         City
1  Bob   26  Los Angeles


In [232]:
# Using .at[] and .iat[] for Fast Access to a Single Element


# Using `.at[]` - Access the 'City' of Alice
print(df.at[0, 'City'])

# Using `.iat[]` - Access the 'City' of Alice using integer indices
print(df.iat[0, 2])

New York
New York


In [236]:
df.index.name = 'id'; 
df.columns.name = 'item'
df

item,Name,Age,City
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Alice,25,New York
1,Bob,26,Los Angeles
2,Chris,27,Chicago


In [243]:
import pandas as pd

# Create a DataFrame
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Chris'],
    'Age': [25, 26, 27]
})

# Assign a single value to a new column
df['Status'] = 'Active'
print(df)


    Name  Age  Status
0  Alice   25  Active
1    Bob   26  Active
2  Chris   27  Active


In [245]:
# Assigning Values Using loc

# Set 'Age' of Bob to 30
df.loc[df['Name'] == 'Bob', 'Age'] = 30

# Set multiple values conditionally
df.loc[df['Age'] > 25, 'Status'] = 'Senior'
print(df)


    Name  Age  Status
0  Alice   25  Active
1    Bob   30  Senior
2  Chris   27  Senior


In [247]:
# Assigning Values Using iloc ( integer based assignment)

# Set the value of the first row, second column (Age of Alice)
df.iloc[0, 1] = 24
print(df)


    Name  Age  Status
0  Alice   24  Active
1    Bob   30  Senior
2  Chris   27  Senior


In [249]:
# Adding a New Column Based on Existing Columns

df['Seniority'] = df['Age'] - 20
print(df)


    Name  Age  Status  Seniority
0  Alice   24  Active          4
1    Bob   30  Senior         10
2  Chris   27  Senior          7


In [251]:
# Using assign Method

# Create or modify columns using 'assign'
new_df = df.assign(
    Age_Plus_Five = df['Age'] + 5,
    Senior_Status = lambda x: x['Age'] > 28
)
print(new_df)


    Name  Age  Status  Seniority  Age_Plus_Five  Senior_Status
0  Alice   24  Active          4             29          False
1    Bob   30  Senior         10             35           True
2  Chris   27  Senior          7             32          False


In [253]:
# Setting Values with Conditions
import numpy as np
# Set 'Age' based on a condition
df['Age'] = np.where(df['Age'] > 26, 30, df['Age'])
print(df)


    Name  Age  Status  Seniority
0  Alice   24  Active          4
1    Bob   30  Senior         10
2  Chris   30  Senior          7


In [263]:
# Applying a Function to Modify a Column

# Update 'Name' to uppercase
df['Name'] = df['Name'].apply(lambda x: x.upper()) 
print(df['Name'])
print(type(df['Name']))
print(df)

# A lambda function is a small anonymous function defined with the keyword lambda.
# The syntax of a lambda function is lambda arguments: expression.
# It can take any number of arguments but only supports a single expression.
# In the above ex, takes 1 argument( x) and the applies expression (x.upper()) to it. 
# here x is an element of the DataFrame
# Pandas iterates over each element of the 'Name' Series.
# For each element (name), the lambda function is called with this element as x.
# The x.upper() method transforms the name to uppercase.
# The .apply() method collects all these transformed names into a new Series that replaces the original 'Name' column.

0    ALICE
1      BOB
2    CHRIS
Name: Name, dtype: object
<class 'pandas.core.series.Series'>
    Name  Age  Status  Seniority
0  ALICE   24  Active          4
1    BOB   30  Senior         10
2  CHRIS   30  Senior          7


In [5]:
import pandas as pd
import numpy as np

# Creating a multi-level indexed DataFrame
np.random.seed(42)  # For reproducibility
df = pd.DataFrame(np.random.rand(6, 2),
                  index=[['green', 'green', 'blue', 'blue', 'red', 'red'], 
                         ['top', 'bottom', 'top', 'bottom', 'top', 'bottom']],
                  columns=['A', 'B'])
print(df)
# print(df['A']['top'])
# print(df.loc[:, 'top']['A'])
# print(df.loc['top', 'A'])
print(df.loc[(slice(None), 'top'), 'A'])

                     A         B
green top     0.374540  0.950714
      bottom  0.731994  0.598658
blue  top     0.156019  0.155995
      bottom  0.058084  0.866176
red   top     0.601115  0.708073
      bottom  0.020584  0.969910
green  top    0.374540
blue   top    0.156019
red    top    0.601115
Name: A, dtype: float64
