https://levelup.gitconnected.com/python-pandas-tutorial-a-comprehensive-guide-to-data-manipulation-and-analysis-219f108018d0

In [1]:
import pandas as pd

In [2]:
# Creating a Series from a list
data_list = [10, 20, 30, 40, 50]
series_from_list = pd.Series(data_list)
print(series_from_list)

# Creating a Series from a NumPy array
import numpy as np
data_array = np.array([1.1, 2.2, 3.3, 4.4, 5.5])
series_from_array = pd.Series(data_array)
print(series_from_array)

# Creating a Series from a dictionary
data_dict = {'A': 100, 'B': 200, 'C': 300, 'D': 400}
series_from_dict = pd.Series(data_dict)
print(series_from_dict)

0    10
1    20
2    30
3    40
4    50
dtype: int64
0    1.1
1    2.2
2    3.3
3    4.4
4    5.5
dtype: float64
A    100
B    200
C    300
D    400
dtype: int64


In [3]:
# Accessing elements by index
print(series_from_list[0])
print(series_from_array[2])
print(series_from_dict['B'])

# Accessing elements by label-based index
print(series_from_list.iloc[1])
print(series_from_array.loc[2])

10
3.3
200
20
3.3


In [4]:
# Mathematical operations
result = series_from_list + series_from_array
print(result)

# Conditional filtering
filtered_series = series_from_list[series_from_list > 20]
print(filtered_series)

# Check for null values
print(series_from_dict.isnull())

0    11.1
1    22.2
2    33.3
3    44.4
4    55.5
dtype: float64
2    30
3    40
4    50
dtype: int64
A    False
B    False
C    False
D    False
dtype: bool


In [5]:
# Creating a DataFrame from a dictionary
data_dict = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'City': ['New York', 'London', 'Paris', 'Tokyo']
}
df = pd.DataFrame(data_dict)
print(df)

# Creating a DataFrame from a list of dictionaries
data_list_of_dicts = [
    {'Name': 'Alice', 'Age': 25, 'City': 'New York'},
    {'Name': 'Bob', 'Age': 30, 'City': 'London'},
    {'Name': 'Charlie', 'Age': 35, 'City': 'Paris'},
    {'Name': 'David', 'Age': 40, 'City': 'Tokyo'}
]
df_from_list_of_dicts = pd.DataFrame(data_list_of_dicts)
print(df_from_list_of_dicts)

# Creating a DataFrame from external data (e.g., CSV file)
# df = pd.read_csv('data.csv')

      Name  Age      City
0    Alice   25  New York
1      Bob   30    London
2  Charlie   35     Paris
3    David   40     Tokyo
      Name  Age      City
0    Alice   25  New York
1      Bob   30    London
2  Charlie   35     Paris
3    David   40     Tokyo


In [6]:
# Displaying basic information about the DataFrame
print(df.info())

# Displaying summary statistics
print(df.describe())

# Accessing specific columns
print(df['Name'])
print(df.Age)

# Accessing rows using iloc (integer-location based indexing)
print(df.iloc[0])  # First row
print(df.iloc[1:3])  # Rows 2 and 3

# Accessing rows using loc (label-location based indexing)
print(df.loc[0])  # First row
print(df.loc[1:2])  # Rows 2 and 3

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Age     4 non-null      int64 
 2   City    4 non-null      object
dtypes: int64(1), object(2)
memory usage: 224.0+ bytes
None
             Age
count   4.000000
mean   32.500000
std     6.454972
min    25.000000
25%    28.750000
50%    32.500000
75%    36.250000
max    40.000000
0      Alice
1        Bob
2    Charlie
3      David
Name: Name, dtype: object
0    25
1    30
2    35
3    40
Name: Age, dtype: int64
Name       Alice
Age           25
City    New York
Name: 0, dtype: object
      Name  Age    City
1      Bob   30  London
2  Charlie   35   Paris
Name       Alice
Age           25
City    New York
Name: 0, dtype: object
      Name  Age    City
1      Bob   30  London
2  Charlie   35   Paris


In [7]:
# Selecting a single column
name_column = df['Name']
print(name_column)

# Selecting multiple columns
name_age_columns = df[['Name', 'Age']]
print(name_age_columns)

0      Alice
1        Bob
2    Charlie
3      David
Name: Name, dtype: object
      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35
3    David   40


In [8]:
# Filtering rows based on a condition
filtered_df = df[df['Age'] > 30]
print(filtered_df)

# Multiple conditions (use & for AND, | for OR)
filtered_df = df[(df['Age'] > 25) & (df['City'] == 'New York')]
print(filtered_df)

      Name  Age   City
2  Charlie   35  Paris
3    David   40  Tokyo
Empty DataFrame
Columns: [Name, Age, City]
Index: []


In [9]:
# Adding a new column
df['Salary'] = [50000, 60000, 70000, 80000]
print(df)

# Removing a column
df = df.drop('Salary', axis=1)  # axis=1 means column-wise operation
print(df)

      Name  Age      City  Salary
0    Alice   25  New York   50000
1      Bob   30    London   60000
2  Charlie   35     Paris   70000
3    David   40     Tokyo   80000
      Name  Age      City
0    Alice   25  New York
1      Bob   30    London
2  Charlie   35     Paris
3    David   40     Tokyo


In [11]:
# Applying a function to a column
df['Age'] = df['Age'].apply(lambda x: x + 5)
print(df)

      Name  Age      City
0    Alice   30  New York
1      Bob   35    London
2  Charlie   40     Paris
3    David   45     Tokyo


In [13]:
# Dropping duplicates
df.drop_duplicates(subset=['Name'], keep='first', inplace=True)

In [14]:
# Creating a DataFrame with missing data
data = {'A': [1, 2, np.nan, 4, 5],
        'B': [6, np.nan, 8, 9, 10],
        'C': [11, 12, 13, np.nan, 15]}
df = pd.DataFrame(data)
print(df)

# Drop rows with any missing values
df.dropna(inplace=True)
print(df)

# Fill missing values with a specific value
df.fillna(0, inplace=True)
print(df)

# Fill missing values using interpolation
df['A'] = df['A'].interpolate()
print(df)

     A     B     C
0  1.0   6.0  11.0
1  2.0   NaN  12.0
2  NaN   8.0  13.0
3  4.0   9.0   NaN
4  5.0  10.0  15.0
     A     B     C
0  1.0   6.0  11.0
4  5.0  10.0  15.0
     A     B     C
0  1.0   6.0  11.0
4  5.0  10.0  15.0
     A     B     C
0  1.0   6.0  11.0
4  5.0  10.0  15.0


In [15]:
# Sorting by a single column
df.sort_values(by='B', inplace=True)
print(df)

# Sorting by multiple columns
df.sort_values(by=['B', 'C'], ascending=[False, True], inplace=True)
print(df)

     A     B     C
0  1.0   6.0  11.0
4  5.0  10.0  15.0
     A     B     C
4  5.0  10.0  15.0
0  1.0   6.0  11.0


In [18]:
# Creating a DataFrame from a list of dictionaries
data_list_of_dicts = [
    {'Name': 'Alice', 'Age': 25, 'City': 'New York'},
    {'Name': 'Bob', 'Age': 30, 'City': 'London'},
    {'Name': 'Charlie', 'Age': 35, 'City': 'Paris'},
    {'Name': 'David', 'Age': 40, 'City': 'Tokyo'}
]
df = pd.DataFrame(data_list_of_dicts)
print(df)
# Grouping by a single column and calculating the mean of each group
grouped_df = df.groupby('City')['Age'].mean()
print(grouped_df)

# Grouping by multiple columns and calculating the sum of each group
#grouped_df = df.groupby(['City', 'Gender'])['Salary'].sum()
#print(grouped_df)

      Name  Age      City
0    Alice   25  New York
1      Bob   30    London
2  Charlie   35     Paris
3    David   40     Tokyo
City
London      30.0
New York    25.0
Paris       35.0
Tokyo       40.0
Name: Age, dtype: float64


In [19]:
# Creating a DataFrame with time series data
date_rng = pd.date_range(start='2023-01-01', end='2023-01-10', freq='D')
data = {'Date': date_rng,
        'Temperature': [15, 20, 22, 18, 25, 28, 30, 29, 27, 23]}
df = pd.DataFrame(data)
print(df)

# Setting the 'Date' column as the DataFrame's index
df.set_index('Date', inplace=True)

# Resampling time series data
weekly_mean = df.resample('W').mean()
print(weekly_mean)

# Rolling window operations
rolling_mean = df['Temperature'].rolling(window=3).mean()
print(rolling_mean)

        Date  Temperature
0 2023-01-01           15
1 2023-01-02           20
2 2023-01-03           22
3 2023-01-04           18
4 2023-01-05           25
5 2023-01-06           28
6 2023-01-07           30
7 2023-01-08           29
8 2023-01-09           27
9 2023-01-10           23
            Temperature
Date                   
2023-01-01    15.000000
2023-01-08    24.571429
2023-01-15    25.000000
Date
2023-01-01          NaN
2023-01-02          NaN
2023-01-03    19.000000
2023-01-04    20.000000
2023-01-05    21.666667
2023-01-06    23.666667
2023-01-07    27.666667
2023-01-08    29.000000
2023-01-09    28.666667
2023-01-10    26.333333
Name: Temperature, dtype: float64
