In [None]:
# Exercise 1: Creating Series and DataFrame
import pandas as pd
import numpy as np

# A Series is a one-dimensional labelled array built here from a plain list.
series_data = pd.Series([1, 2, 3, 4, 5])
# One print call with a newline separator emits the title, the Series and a
# trailing blank line.
print("Series:", series_data, "", sep="\n")

name = ['An', 'Bình', 'Châu', 'Dũng']
score = [8.5, 7.0, 6.5, 9.0]

# Each keyword becomes a column; the two lists must have the same length.
df = pd.DataFrame(dict(name=name, score=score))
print("DataFrame:", df, "", sep="\n")

# info() writes its report directly to stdout, so it is called on its own
# instead of being wrapped in print().
print("DataFrame info:")
df.info()
print()

# describe() summarises the numeric column(s) of the frame.
print("DataFrame describe:")
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Exercise 2: Indexing and filtering with loc[], iloc[], query()
import pandas as pd

name = ['An', 'Bình', 'Châu', 'Dũng']
age = [20, 21, 22, 20]
score = [8.5, 7.0, 6.5, 9.0]

df = pd.DataFrame({
    'name': name,
    'age': age,
    'score': score
})

print("Original DataFrame:")
print(df)
print()

# loc[] is label/boolean based: the mask keeps rows whose score is at least 8.
print("DataFrame filtered by loc[] (score >= 8):")
filtered_loc = df.loc[df['score'] >= 8]
print(filtered_loc)
print()

# iloc[] is purely positional, so the column's integer position is looked up
# from its name first. Passing the position inside a list keeps the result a
# one-column DataFrame rather than collapsing it to a Series.
print("DataFrame filtered by iloc[] (age column):")
age_position = df.columns.get_loc('age')
age_column = df.iloc[:, [age_position]]
print(age_column)
print()

# query() evaluates the expression string against the column names.
print("DataFrame filtered by query() (score > 7 and age < 23):")
filtered_query = df.query('score > 7 and age < 23')
print(filtered_query)

In [None]:
# Exercise 3: Handling missing data - isna(), fillna(), dropna()
import pandas as pd
import numpy as np

name = ['An', 'Bình', 'Châu', 'Dũng']
age = [20, None, 22, 20]
score = [8.5, None, 6.5, 9.0]

df = pd.DataFrame({
    'name': name,
    'age': age,
    'score': score
})

print("Original DataFrame with missing values:")
print(df)
print()

print("Dữ liệu thiếu kiểm tra bằng isna():")
print(df.isna())
print()

# Fill each numeric column with its own mean in one DataFrame-level call.
# fillna() returns a new frame, so `df` keeps its NaNs for the dropna() demo.
# Calling `df_filled['age'].fillna(..., inplace=True)` instead would act on a
# temporary column object: pandas warns about that pattern and, with
# Copy-on-Write enabled, the parent frame is not updated at all.
column_means = {
    'age': df['age'].mean(),
    'score': df['score'].mean(),
}
df_filled = df.fillna(column_means)

print("DataFrame sau khi thay thế NaN bằng fillna():")
print(df_filled)
print()

# dropna() removes every row that contains at least one missing value.
df_dropped = df.dropna()
print("DataFrame sau khi loại bỏ NaN bằng dropna():")
print(df_dropped)

In [None]:
# Exercise 4: Data transformation - apply(), map(), astype(), replace()
import pandas as pd

name = ['An', 'Bình', 'Châu', 'Dũng']
age = [20, 21, 22, 20]
score = [8.5, 7.0, 6.5, 9.0]

df = pd.DataFrame(dict(name=name, age=age, score=score))

print("Original DataFrame:", df, "", sep="\n")


def classify_age(years):
    """Return 'Young' for ages up to and including 20, otherwise 'Adult'."""
    if years <= 20:
        return 'Young'
    return 'Adult'


# apply() calls the function once per element of the column.
df['age_group'] = df['age'].apply(classify_age)
print("DataFrame sau khi áp dụng apply():", df, "", sep="\n")

# map() looks each score up in the dictionary; the keys are the exact score
# values present in the data.
score_mapping = dict(zip(
    [8.5, 7.0, 6.5, 9.0],
    ['Excellent', 'Good', 'Average', 'Excellent'],
))
df['score_level'] = df['score'].map(score_mapping)
columns_with_level = ['name', 'age', 'score', 'score_level']
print("DataFrame sau khi map():", df[columns_with_level], "", sep="\n")

base_columns = ['name', 'age', 'score']

# astype(int) truncates the fractional part (8.5 -> 8). assign() returns a
# new frame, so `df` itself keeps its float scores.
df_int = df.assign(score=df['score'].astype(int))
print("DataFrame sau khi astype():", df_int[base_columns], "", sep="\n")

# replace() swaps whole cell values that equal the first argument.
df_replaced = df.assign(name=df['name'].replace('Bình', 'Bình (Updated)'))
print("DataFrame sau khi replace():")
print(df_replaced[base_columns])

In [9]:
# Exercise 5: Group operations - groupby(), agg()
import pandas as pd

name = ['An', 'Bình', 'Châu', 'Dũng', 'Em']
age = [20, 21, 22, 20, 21]
score = [8.5, 7.0, 6.5, 9.0, 8.0]

df = pd.DataFrame({
    'name': name,
    'age': age,
    'score': score
})

print("Original DataFrame:")
print(df)
print()

grouped = df.groupby('age')

# Named aggregation ties every output column name to its function, so the
# labels cannot drift out of step with the list of aggregations the way a
# positional `columns = [...]` rename can.
agg_result = grouped['score'].agg(
    mean_score='mean',
    min_score='min',
    max_score='max',
).reset_index()

print("Groupby age:")
print(agg_result)
print()

print("Chi tiết từng nhóm tuổi:")
# The loop variable is deliberately not called `age`: reusing that name would
# overwrite the `age` list above and leave an int behind in the kernel.
for age_value, group in grouped:
    group_scores = group['score']
    print(f"Age {age_value}:")
    print(group)
    print(f"Mean score: {group_scores.mean():.2f}")
    print(f"Min score: {group_scores.min()}")
    print(f"Max score: {group_scores.max()}")
    print("-" * 30)

Original DataFrame:
   name  age  score
0    An   20    8.5
1  Bình   21    7.0
2  Châu   22    6.5
3  Dũng   20    9.0
4    Em   21    8.0

Groupby age:
   age  mean_score  min_score  max_score
0   20        8.75        8.5        9.0
1   21        7.50        7.0        8.0
2   22        6.50        6.5        6.5

Chi tiết từng nhóm tuổi:
Age 20:
   name  age  score
0    An   20    8.5
3  Dũng   20    9.0
Mean score: 8.75
Min score: 8.5
Max score: 9.0
------------------------------
Age 21:
   name  age  score
1  Bình   21    7.0
4    Em   21    8.0
Mean score: 7.50
Min score: 7.0
Max score: 8.0
------------------------------
Age 22:
   name  age  score
2  Châu   22    6.5
Mean score: 6.50
Min score: 6.5
Max score: 6.5
------------------------------


In [None]:
# Exercise 6: Combining DataFrames - merge(), concat()
import pandas as pd

df1 = pd.DataFrame({
    'id': [1, 2, 3],
    'name': ['An', 'Bình', 'Châu']
})

df2 = pd.DataFrame({
    'id': [1, 2, 3],
    'score': [8.5, 7.0, 6.5]
})

print("DataFrame df1 (student info):")
print(df1)
print()

print("DataFrame df2 (score info):")
print(df2)
print()

# merge() performs a database-style join on the shared 'id' column.
merged_df = pd.merge(df1, df2, on='id')
print("Merged DataFrame:")
print(merged_df)
print()

df3 = pd.DataFrame({
    'id': [1, 2, 3],
    'age': [20, 21, 22]
})

# concat(axis=1) places frames side by side, aligning rows on the index.
# Both frames are indexed by 'id' first so rows are matched by student id
# rather than by row position; reset_index() then restores 'id' as a column.
final_df = pd.concat(
    [merged_df.set_index('id'), df3.set_index('id')],
    axis=1,
).reset_index()
print("Concatenated DataFrame:")
print(final_df)

In [None]:
# Exercise 7: DateTime operations and data type conversion
import pandas as pd

date = ['2023-01-01', '2023-02-01', '2023-03-01']
name = ['An', 'Bình', 'Châu']

df = pd.DataFrame({
    'date': date,
    'name': name
})

print("Original DataFrame:")
print(df)
print("Data types:")
print(df.dtypes)
print()

# Parse the date strings into datetime64 values so the .dt accessor works.
df['date'] = pd.to_datetime(df['date'])
print("Date converted to datetime:")
print(df)
print("Data types after conversion:")
print(df.dtypes)
print()

print("Extracted Year:")
print(df['date'].dt.year)
print()

print("Extracted Month:")
print(df['date'].dt.month)
print()

print("Extracted Day:")
print(df['date'].dt.day)
print()

# Measure the text column BEFORE it is converted: once 'name' is categorical
# the frame has no string column left to measure. deep=True counts the bytes
# of the Python string objects themselves, not just the pointers to them.
string_bytes = df['name'].memory_usage(deep=True)

df['name'] = df['name'].astype('category')
print("Category Type for name:")
print(df['name'])
print()

# Same measurement on the same column after conversion, so the two numbers
# are directly comparable. With only three distinct names the category
# overhead can outweigh the saving; the benefit shows up on columns with many
# repeated values.
category_bytes = df['name'].memory_usage(deep=True)

print("Memory usage comparison:")
print(f"Original string: {string_bytes} bytes")
print(f"Category: {category_bytes} bytes")

In [11]:
# Exercise 8: File processing - JSON log analysis
import pandas as pd
import json
import os
from datetime import datetime

logs_dir = 'logs'
# exist_ok=True makes this safe to re-run and avoids a separate
# check-then-create step.
os.makedirs(logs_dir, exist_ok=True)

log_data1 = [
    {"user": "A", "action": "login", "duration": 5},
    {"user": "B", "action": "view", "duration": 3},
    {"user": "A", "action": "logout", "duration": 1}
]

log_data2 = [
    {"user": "B", "action": "login", "duration": 2},
    {"user": "C", "action": "view", "duration": 4},
    {"user": "B", "action": "logout", "duration": 1}
]

# Write the sample files through logs_dir so the writer and the reader below
# always agree on the location.
sample_logs = {
    'log_2024-01-01.json': log_data1,
    'log_2024-01-02.json': log_data2,
}
for sample_name, sample_records in sample_logs.items():
    with open(os.path.join(logs_dir, sample_name), 'w', encoding='utf-8') as f:
        json.dump(sample_records, f)

print("Created sample log files:")
print("logs/log_2024-01-01.json")
print("logs/log_2024-01-02.json")
print()

LOG_PREFIX = 'log_'
LOG_SUFFIX = '.json'

all_data = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the combined frame reproducible across platforms and runs.
for filename in sorted(os.listdir(logs_dir)):
    # Only files named log_<date>.json carry a parseable date; anything else
    # in the directory is skipped instead of crashing the date parser.
    if not (filename.startswith(LOG_PREFIX) and filename.endswith(LOG_SUFFIX)):
        continue

    date_str = filename[len(LOG_PREFIX):-len(LOG_SUFFIX)]
    date = pd.to_datetime(date_str, format='%Y-%m-%d')

    with open(os.path.join(logs_dir, filename), 'r', encoding='utf-8') as f:
        logs = json.load(f)

    # Tag every record with the date taken from its file name. A new dict is
    # built per record so the loaded data is not mutated.
    all_data.extend({**log, 'date': date} for log in logs)

df_logs = pd.DataFrame(all_data)
print("All log data:")
print(df_logs)
print()

# Total duration per day, plus a preformatted date string for the report.
daily_summary = df_logs.groupby('date')['duration'].sum().reset_index()
daily_summary['date_str'] = daily_summary['date'].dt.strftime('%Y-%m-%d')

print("Daily summary:")
for row in daily_summary.itertuples(index=False):
    print(f"Date: {row.date_str}, Total Duration: {row.duration}")

daily_summary.to_csv('daily_duration_report.csv', index=False)
print()
print("Report exported to daily_duration_report.csv")

print("\nCSV Content:")
print(pd.read_csv('daily_duration_report.csv'))

Created sample log files:
logs/log_2024-01-01.json
logs/log_2024-01-02.json

All log data:
  user  action  duration       date
0    A   login         5 2024-01-01
1    B    view         3 2024-01-01
2    A  logout         1 2024-01-01
3    B   login         2 2024-01-02
4    C    view         4 2024-01-02
5    B  logout         1 2024-01-02

Daily summary:
Date: 2024-01-01, Total Duration: 9
Date: 2024-01-02, Total Duration: 7

Report exported to daily_duration_report.csv

CSV Content:
         date  duration    date_str
0  2024-01-01         9  2024-01-01
1  2024-01-02         7  2024-01-02
