# Key Data Structures in Pandas

1. Series
   - A one-dimensional labeled array capable of holding any data type.
   - Can be created from a list, dictionary, or array.

2. DataFrame
   - A two-dimensional labeled data structure with columns of potentially different types.
   - Can be thought of as a dictionary of Series.
   - Can be created from a variety of inputs, including lists, dictionaries, and external files (CSV, Excel, etc.).

In [1]:
import pandas as pd

In [2]:
# A Series pairs each value with a label; here five integers are labelled 'a'..'e'.
s = pd.Series(
    data=[1, 2, 3, 4, 5],
    index=['a', 'b', 'c', 'd', 'e'],
)

In [3]:
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [4]:
# Build a DataFrame from a dict: every key becomes a column, every list its values.
df = pd.DataFrame(
    data={
        'names': ["harry", "sally", "john", "doe"],
        'marks': [85, 90, 78, 88],
    }
)

In [5]:
df

Unnamed: 0,names,marks
0,harry,85
1,sally,90
2,john,78
3,doe,88


In [6]:
# Load the iris CSV into a DataFrame; the relative path is resolved against
# the kernel's current working directory.
# Note: this rebinds `df`, replacing the small names/marks frame built earlier.
df = pd.read_csv('../data/iris.csv')

In [7]:
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [8]:
# Preview the first rows of the frame (head() shows 5 rows by default).
df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [9]:
# Preview the last rows of the frame (tail() shows 5 rows by default).
df.tail()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica
149,5.9,3.0,5.1,1.8,Virginica


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the text column `variety` is left out of the result.
df.describe()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [11]:
# Print a structural overview: index range, per-column non-null counts and
# dtypes, and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  150 non-null    float64
 1   sepal.width   150 non-null    float64
 2   petal.length  150 non-null    float64
 3   petal.width   150 non-null    float64
 4   variety       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


# Data Selection

In [12]:
# Passing a *list* of column names inside [] selects those columns and returns
# a DataFrame, even when the list holds a single name.
df[["sepal.length"]]

Unnamed: 0,sepal.length
0,5.1
1,4.9
2,4.7
3,4.6
4,5.0
...,...
145,6.7
146,6.3
147,6.5
148,6.2


In [13]:
# Select several columns at once; the result keeps the order given in the list.
df[["sepal.length", "sepal.width"]]

Unnamed: 0,sepal.length,sepal.width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
...,...,...
145,6.7,3.0
146,6.3,2.5
147,6.5,3.0
148,6.2,3.4


In [14]:
# iloc selects by integer position: position 1 is the second row. A single row
# comes back as a Series whose index is the frame's column names.
df.iloc[1]

sepal.length       4.9
sepal.width        3.0
petal.length       1.4
petal.width        0.2
variety         Setosa
Name: 1, dtype: object

In [15]:
# Load a second CSV with the same columns as the iris frame but with some
# values missing (visible as blanks in the output and in df2.info() below).
df2 = pd.read_csv('../data/data.csv')

In [16]:
df2

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [17]:
df2.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [18]:
# Drop every row that contains at least one missing value (e.g. row 3 is gone
# from the output). dropna() returns a new DataFrame; because the result is not
# assigned, df2 itself still contains the missing values.
df2.dropna()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
7,5.0,3.4,1.5,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [19]:
# Replace every missing value with 0 (row 3's sepal.width becomes 0.0).
# The result is only displayed, not assigned, so df2 is left unchanged.
# Note: a scalar fill applies to all columns, so missing entries in the text
# column `variety` are filled with the number 0 as well.
df2.fillna(0)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,0.0,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [20]:
# Rename two columns through an {old name: new name} mapping.
# The renamed frame is only displayed; df2 keeps its original column names
# because the result is not assigned.
df2.rename(
    columns={
        'sepal.length': 'sl',
        'sepal.width': 'sw',
    }
)

Unnamed: 0,sl,sw,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [21]:
# Non-null counts below the 150 total entries reveal which columns contain
# missing values.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  146 non-null    float64
 1   sepal.width   147 non-null    float64
 2   petal.length  147 non-null    float64
 3   petal.width   150 non-null    float64
 4   variety       144 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [22]:
# Convert the column from float to integer. astype(int) drops the fractional
# part instead of rounding (5.1 -> 5, 4.9 -> 4 in the output below), and the
# assignment overwrites the original float values held in df.
df['sepal.length'] = df['sepal.length'].astype(int)

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  150 non-null    int64  
 1   sepal.width   150 non-null    float64
 2   petal.length  150 non-null    float64
 3   petal.width   150 non-null    float64
 4   variety       150 non-null    object 
dtypes: float64(3), int64(1), object(1)
memory usage: 6.0+ KB


In [24]:
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5,3.5,1.4,0.2,Setosa
1,4,3.0,1.4,0.2,Setosa
2,4,3.2,1.3,0.2,Setosa
3,4,3.1,1.5,0.2,Setosa
4,5,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6,3.0,5.2,2.3,Virginica
146,6,2.5,5.0,1.9,Virginica
147,6,3.0,5.2,2.0,Virginica
148,6,3.4,5.4,2.3,Virginica


In [25]:
# Add a new column holding a running counter 1, 2, ..., len(df).
# NOTE: despite the column name 'zeroes', the values are not zeros.
df['zeroes'] = list(range(1, len(df) + 1))

In [26]:
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,zeroes
0,5,3.5,1.4,0.2,Setosa,1
1,4,3.0,1.4,0.2,Setosa,2
2,4,3.2,1.3,0.2,Setosa,3
3,4,3.1,1.5,0.2,Setosa,4
4,5,3.6,1.4,0.2,Setosa,5
...,...,...,...,...,...,...
145,6,3.0,5.2,2.3,Virginica,146
146,6,2.5,5.0,1.9,Virginica,147
147,6,3.0,5.2,2.0,Virginica,148
148,6,3.4,5.4,2.3,Virginica,149


In [27]:
def fx(a):
    """Return ``a`` increased by one."""
    incremented = a + 1
    return incremented

# Series.apply calls fx once per element and collects the results into a new
# column. Since fx only adds 1, the vectorized expression df["zeroes"] + 1
# would produce the same values.
df["zeroes + 1"] = df["zeroes"].apply(fx)
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,zeroes,zeroes + 1
0,5,3.5,1.4,0.2,Setosa,1,2
1,4,3.0,1.4,0.2,Setosa,2,3
2,4,3.2,1.3,0.2,Setosa,3,4
3,4,3.1,1.5,0.2,Setosa,4,5
4,5,3.6,1.4,0.2,Setosa,5,6
...,...,...,...,...,...,...,...
145,6,3.0,5.2,2.3,Virginica,146,147
146,6,2.5,5.0,1.9,Virginica,147,148
147,6,3.0,5.2,2.0,Virginica,148,149
148,6,3.4,5.4,2.3,Virginica,149,150


In [28]:
# Write the current frame to CSV; index=False leaves the row index out of the
# file. The exported data is df as modified above: integer sepal.length plus
# the two added counter columns.
df.to_csv('../data/export.csv', index=False)

In [29]:
# First of two frames with identical columns, used for the concat demo.
df1 = pd.DataFrame(
    data={
        "name": ["Alice", "Bob", "Charlie", "Rohan", "Shubham", "Aparna", "Khushi"],
        "marks": [85, 90, 78, 88, 92, 95, 80],
    }
)

df1

Unnamed: 0,name,marks
0,Alice,85
1,Bob,90
2,Charlie,78
3,Rohan,88
4,Shubham,92
5,Aparna,95
6,Khushi,80


In [30]:
# Second frame for the concat demo; same columns as df1.
# Note: this rebinds df2, replacing the CSV-loaded frame used in earlier cells.
df2 = pd.DataFrame(
    data={
        "name": ["Akash", "Akasha", "Alka", "Priyanka"],
        "marks": [88, 92, 79, 85],
    }
)

df2

Unnamed: 0,name,marks
0,Akash,88
1,Akasha,92
2,Alka,79
3,Priyanka,85


In [31]:
# Stack the two frames vertically (rows of df1 followed by rows of df2).
# ignore_index=True discards the original row labels and renumbers from 0.
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,name,marks
0,Alice,85
1,Bob,90
2,Charlie,78
3,Rohan,88
4,Shubham,92
5,Aparna,95
6,Khushi,80
7,Akash,88
8,Akasha,92
9,Alka,79


In [32]:
# Left-hand table for the merge demo: marks keyed by student name.
# (Rebinds df1, replacing the seven-row frame defined earlier.)
df1 = pd.DataFrame(
    data={"name": ["Alka", "Priyanka"], "marks": [88, 92]},
)

In [33]:
# Right-hand table for the merge demo: roll numbers keyed by student name.
# It holds one extra name ("Harry") that has no match in df1.
df2 = pd.DataFrame(
    data={"name": ["Alka", "Priyanka", "Harry"], "rollNo": [266, 284, 299]},
)

In [34]:
# Join the two tables on the shared 'name' column. The default join is an
# inner join, so only names present in both frames are kept ("Harry", who
# appears only in df2, is absent from the result).
df1.merge(df2, on='name')

Unnamed: 0,name,marks,rollNo
0,Alka,88,266
1,Priyanka,92,284
