# introduction to pandas 


In [2]:
import pandas as pd
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
        'Age': [25, 30, 35, 40],
        'City': ['New York', 'Los Angeles', 'Chicago', 'Houston']}
df = pd.DataFrame(data)


In [3]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston


In [4]:
# Display the first few rows of the DataFrame
print(df.head())

# Get the shape of the DataFrame
print(df.shape)

# Get summary statistics of the DataFrame
print(df.describe())

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
3    David   40      Houston
(4, 3)
             Age
count   4.000000
mean   32.500000
std     6.454972
min    25.000000
25%    28.750000
50%    32.500000
75%    36.250000
max    40.000000


# pandas series

In [5]:
#pandas series

import pandas as pd
data = ['apple', 'banana', 'cherry', 'date']
series = pd.Series(data)
print(series)


0     apple
1    banana
2    cherry
3      date
dtype: object


In [6]:
series

0     apple
1    banana
2    cherry
3      date
dtype: object

In [7]:
#from numpy array 

import numpy as np
import pandas as pd
data = np.array(['a', 'b', 'c', 'd'])
series = pd.Series(data)
print(series)


0    a
1    b
2    c
3    d
dtype: object


In [8]:
#indexing 
print(series[1])  # Accessing the element at index 1


b


In [9]:
#slicing
print(series[1:3]) 

1    b
2    c
dtype: object


In [10]:
#attributes 

print(series.dtype)

object


In [11]:
print(series.index)

RangeIndex(start=0, stop=4, step=1)


In [12]:
#operations 
print(series.str.upper())  # Convert all elements to uppercase

0    A
1    B
2    C
3    D
dtype: object


In [13]:
#adding labes for index 

series = pd.Series(data, index=['w', 'x', 'y', 'z'])
print(series)

w    a
x    b
y    c
z    d
dtype: object


In [14]:
#handling missing data 

import pandas as pd
import numpy as np
data = {'a': 1, 'b': 2, 'c': np.nan, 'd': 4}
series = pd.Series(data)
print(series)

a    1.0
b    2.0
c    NaN
d    4.0
dtype: float64


In [15]:
#checking null values 
print(series.isnull())

a    False
b    False
c     True
d    False
dtype: bool


In [16]:
#addition 

series1 = pd.Series([1, 2, 3, 4])
series2 = pd.Series([10, 20, 30, 40])
print(series1 + series2)

0    11
1    22
2    33
3    44
dtype: int64


In [17]:
#multiplication 

print(series1 * series2)

0     10
1     40
2     90
3    160
dtype: int64


In [18]:
#sort by index
print(series.sort_index())

a    1.0
b    2.0
c    NaN
d    4.0
dtype: float64


In [20]:
#sort by values
print(series.sort_values())

a    1.0
b    2.0
d    4.0
c    NaN
dtype: float64


In [21]:
# apply functions 

print(series.apply(lambda x: x * 2))

a    2.0
b    4.0
c    NaN
d    8.0
dtype: float64


In [22]:
#merging or concatenation 

series1 = pd.Series([1, 2, 3])
series2 = pd.Series([4, 5, 6])
concatenated_series = pd.concat([series1, series2])
print(concatenated_series)

0    1
1    2
2    3
0    4
1    5
2    6
dtype: int64


In [23]:
replaced_series = series.replace({10: 100, 30: 300})
print(replaced_series)

a    1.0
b    2.0
c    NaN
d    4.0
dtype: float64


In [24]:
import pandas as pd

s = pd.Series([1, 2, 3, 4, 5])
print(s.shape)


(5,)


In [25]:
#dimension 

import pandas as pd

s = pd.Series([1, 2, 3, 4, 5])
print(s.ndim)  # Output: 1

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
print(df.ndim)  # Output: 2


1
2


In [26]:
#missing data 
import pandas as pd

data = pd.Series([1, 2, None, 4, None, 6])
print(data.isnull())

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool


In [27]:
cleaned_data = data.dropna()
print(cleaned_data)

0    1.0
1    2.0
3    4.0
5    6.0
dtype: float64


In [28]:
filled_data = data.fillna(0)  # Fill NaNs with 0
print(filled_data)

0    1.0
1    2.0
2    0.0
3    4.0
4    0.0
5    6.0
dtype: float64


In [29]:
#duplicated values 

data = pd.Series([1, 2, 2, 3, 3, 4, 5, 5])
print(data.duplicated())

0    False
1    False
2     True
3    False
4     True
5    False
6    False
7     True
dtype: bool


In [30]:
cleaned_data = data.drop_duplicates()
print(cleaned_data)

0    1
1    2
3    3
5    4
6    5
dtype: int64


In [34]:
#Use ffill to fill missing values with the previous non-missing value (forward fill):
data=pd.Series([1,2,3,np.nan,5,6,np.nan,np.nan])
filled_data = data.ffill()
filled_data

0    1.0
1    2.0
2    3.0
3    3.0
4    5.0
5    6.0
6    6.0
7    6.0
dtype: float64

In [36]:
#Use ffill to fill missing values with the previous non-missing value (forward fill):
data=pd.Series([1,2,3,np.nan,5,6,np.nan,np.nan,8])
filled_data = data.bfill()
filled_data

0    1.0
1    2.0
2    3.0
3    5.0
4    5.0
5    6.0
6    8.0
7    8.0
8    8.0
dtype: float64

In [37]:
replaced_data = data.replace(2, 20)  # Replace all occurrences of 2 with 20
replaced_data

0     1.0
1    20.0
2     3.0
3     NaN
4     5.0
5     6.0
6     NaN
7     NaN
8     8.0
dtype: float64

In [41]:
cleaned_data = data.drop_duplicates()
cleaned_data

0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
5    6.0
8    8.0
dtype: float64

In [42]:
cleaned_data = data.drop_duplicates(ignore_index=True)
cleaned_data

0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
5    6.0
6    8.0
dtype: float64

In [44]:
missing_count = data.isnull().sum()
missing_count

3

In [45]:
duplicate_count = data.duplicated().sum()
duplicate_count

2

In [46]:
mean_value = data.mean()
median_value = data.median()
mode_value = data.mode().iloc[0]

filled_data_mean = data.fillna(mean_value)
filled_data_median = data.fillna(median_value)
filled_data_mode = data.fillna(mode_value)

In [47]:
cleaned_data_first = data.drop_duplicates(keep='first')  # Keep the first occurrence
cleaned_data_last = data.drop_duplicates(keep='last')    # Keep the last occurrence
cleaned_data_all = data.drop_duplicates(keep=False)

In [51]:
import numpy as np

data=pd.Series([1,2,3,-4,-5,6,4,2,5,8,-11,-14])
replaced_data = np.where(data < 0, 0, data)  
print(replaced_data)
masked_data = data.mask(data < 0, 0) 
print(masked_data)

[1 2 3 0 0 6 4 2 5 8 0 0]
0     1
1     2
2     3
3     0
4     0
5     6
6     4
7     2
8     5
9     8
10    0
11    0
dtype: int64


# data frames

In [52]:
import pandas as pd

data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
        'Age': [25, 30, 35, 40],
        'Score': [85, 90, 75, 95]}

df = pd.DataFrame(data)
print(df)


      Name  Age  Score
0    Alice   25     85
1      Bob   30     90
2  Charlie   35     75
3    David   40     95


In [53]:
df

Unnamed: 0,Name,Age,Score
0,Alice,25,85
1,Bob,30,90
2,Charlie,35,75
3,David,40,95


In [54]:
data = [{'Name': 'Alice', 'Age': 25, 'Score': 85},
        {'Name': 'Bob', 'Age': 30, 'Score': 90},
        {'Name': 'Charlie', 'Age': 35, 'Score': 75},
        {'Name': 'David', 'Age': 40, 'Score': 95}]

df = pd.DataFrame(data)
print(df)


      Name  Age  Score
0    Alice   25     85
1      Bob   30     90
2  Charlie   35     75
3    David   40     95


Unnamed: 0,Name,Age,Score
0,Alice,25,85
1,Bob,30,90
2,Charlie,35,75
3,David,40,95


In [58]:
df = pd.DataFrame(data, columns=['Name', 'Age', 'Score'])
print(df)

      Name  Age  Score
0    Alice   25     85
1      Bob   30     90
2  Charlie   35     75
3    David   40     95


In [2]:
import pandas as pd 

df = pd.DataFrame(columns=['Name', 'Age', 'Score'])
df.loc[0] = ['Alice', 25, 85]
df.loc[1] = ['Bob', 30, 90]
df.loc[2] = ['Charlie', 35, 75]
df.loc[3] = ['David', 40, 95]
df.loc[4] = ['Devaraj', 65, 15]
print(df)


      Name  Age  Score
0    Alice   25     85
1      Bob   30     90
2  Charlie   35     75
3    David   40     95
4  Devaraj   65     15


In [4]:
df = pd.read_csv('student.csv')
print(df)

    id         name  class  mark  gender
0    1     John Deo   Four    75  female
1    2     Max Ruin  Three    85    male
2    3       Arnold  Three    55    male
3    4   Krish Star   Four    60  female
4    5    John Mike   Four    60  female
5    6    Alex John   Four    55    male
6    7  My John Rob  Fifth    78    male
7    8       Asruid   Five    85    male
8    9      Tes Qry    Six    78    male
9   10     Big John   Four    55  female
10  11       Ronald    Six    89  female
11  12        Recky    Six    94  female
12  13          Kty  Seven    88  female
13  14         Bigy  Seven    88  female
14  15     Tade Row   Four    88    male
15  16        Gimmy   Four    88    male
16  17        Tumyu    Six    54    male
17  18        Honny   Five    75    male
18  19        Tinny   Nine    18    male
19  20       Jackly   Nine    65  female
20  21   Babby John   Four    69  female
21  22       Reggid  Seven    55  female
22  23        Herod  Eight    79    male
23  24    Tiddy 

In [10]:
df["gender"].value_counts().sum()
df["gender"].value_counts().unique()[0]

18

In [63]:
print("Accessing a specific element:")
print(df.iloc[0, 1])  # Accessing the element at row 0, column 1

print("\nAccessing a specific row:")
print(df.iloc[0])  # Accessing the first row

print("\nAccessing a specific column:")
print(df['name'])  # Accessing the 'Name' column


Accessing a specific element:
John Deo

Accessing a specific row:
id               1
name      John Deo
class         Four
mark            75
gender      female
Name: 0, dtype: object

Accessing a specific column:
0        John Deo
1        Max Ruin
2          Arnold
3      Krish Star
4       John Mike
5       Alex John
6     My John Rob
7          Asruid
8         Tes Qry
9        Big John
10         Ronald
11          Recky
12            Kty
13           Bigy
14       Tade Row
15          Gimmy
16          Tumyu
17          Honny
18          Tinny
19         Jackly
20     Babby John
21         Reggid
22          Herod
23      Tiddy Now
24       Giff Tow
25         Crelea
26       Big Nose
27      Rojj Base
28    Tess Played
29      Reppy Red
30    Marry Toeey
31      Binn Rott
32      Kenn Rein
33       Gain Toe
34     Rows Noump
Name: name, dtype: object


In [66]:
filtered_df = df[df['mark'] > 45]
print(filtered_df)

    id         name  class  mark  gender
0    1     John Deo   Four    75  female
1    2     Max Ruin  Three    85    male
2    3       Arnold  Three    55    male
3    4   Krish Star   Four    60  female
4    5    John Mike   Four    60  female
5    6    Alex John   Four    55    male
6    7  My John Rob  Fifth    78    male
7    8       Asruid   Five    85    male
8    9      Tes Qry    Six    78    male
9   10     Big John   Four    55  female
10  11       Ronald    Six    89  female
11  12        Recky    Six    94  female
12  13          Kty  Seven    88  female
13  14         Bigy  Seven    88  female
14  15     Tade Row   Four    88    male
15  16        Gimmy   Four    88    male
16  17        Tumyu    Six    54    male
17  18        Honny   Five    75    male
19  20       Jackly   Nine    65  female
20  21   Babby John   Four    69  female
21  22       Reggid  Seven    55  female
22  23        Herod  Eight    79    male
23  24    Tiddy Now  Seven    78    male
24  25     Giff 

In [67]:
grouped_df = df.groupby('gender').count()
print(grouped_df)

        id  name  class  mark
gender                       
female  17    17     17    17
male    18    18     18    18


In [70]:
grouped_df = df.groupby('mark').sum()
grouped_df

Unnamed: 0_level_0,id,name,class,gender
mark,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18,19,Tinny,Nine,male
54,17,Tumyu,Six,male
55,70,ArnoldAlex JohnBig JohnReggidTess Played,ThreeFourFourSevenSeven,malemalefemalefemalemale
60,9,Krish StarJohn Mike,FourFour,femalefemale
65,20,Jackly,Nine,female
69,55,Babby JohnGain Toe,FourSeven,femalemale
75,19,John DeoHonny,FourFive,femalemale
78,40,My John RobTes QryTiddy Now,FifthSixSeven,malemalemale
79,79,HerodCreleaReppy Red,EightSevenSix,malemalefemale
81,27,Big Nose,Three,female
