In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import chardet

# 检测文件编码
def detect_encoding(file_path):
    """Guess the text encoding of a file with chardet.

    The whole file is read as bytes and handed to ``chardet.detect``.
    The detected encoding name is printed and returned.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The value of the ``'encoding'`` key in chardet's result.
    """
    with open(file_path, 'rb') as raw_file:
        raw_bytes = raw_file.read()

    guess = chardet.detect(raw_bytes)
    detected = guess['encoding']
    print(f"检测到的编码: {detected}")
    return detected

# 尝试读取CSV文件并处理异常
def read_csv(file_path, encoding, fallback_encodings=("utf-8", "gb18030")):
    """Load a CSV file into a DataFrame, retrying with fallback encodings.

    The file is first read with ``encoding``. If decoding fails (or the
    codec name is unknown), each entry of ``fallback_encodings`` that has
    not been tried yet is attempted in order.

    Args:
        file_path: Path of the CSV file.
        encoding: Preferred encoding, e.g. the value returned by
            ``detect_encoding``.
        fallback_encodings: Encodings to try after ``encoding`` fails.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        SystemExit: With exit status 1 when the file is missing, is empty,
            or cannot be decoded with any candidate encoding.
    """
    # Build the ordered, de-duplicated list of encodings to attempt.
    candidates = []
    seen = set()
    for name in (encoding, *fallback_encodings):
        key = str(name).lower()
        if key not in seen:
            seen.add(key)
            candidates.append(name)

    last_error = None
    for candidate in candidates:
        try:
            df = pd.read_csv(file_path, encoding=candidate)  # read the data from the CSV file
        except FileNotFoundError as e:
            print(f"文件未找到: {e}")
            # `exit()` is the interactive helper injected by `site`; raise
            # SystemExit directly and use a non-zero status for the failure.
            raise SystemExit(1) from e
        except pd.errors.EmptyDataError as e:
            print(f"发现空数据: {e}")
            raise SystemExit(1) from e
        except (UnicodeDecodeError, LookupError) as e:
            # Actually do what the message announces: move on to the next encoding.
            print(f"编码错误: {e}。尝试使用不同的编码重新读取文件。")
            last_error = e
            continue
        print("文件读取成功")
        return df

    print("所有候选编码均无法读取文件。")
    raise SystemExit(1) from last_error

# 确保event_time列是datetime类型
def ensure_datetime(df, column_name):
    """Convert ``df[column_name]`` to datetime, modifying ``df`` in place.

    Args:
        df: DataFrame holding the column.
        column_name: Name of the column to convert with ``pd.to_datetime``.

    Returns:
        None. The converted column is written back into ``df``.

    Raises:
        SystemExit: With exit status 1 when ``pd.to_datetime`` raises
            ``ValueError`` for the column.
    """
    try:
        df[column_name] = pd.to_datetime(df[column_name])
    except ValueError as e:
        print(f"日期时间转换错误: {e}")
        # `exit()` is the interactive helper injected by `site`; raise
        # SystemExit directly and use a non-zero status for the failure.
        raise SystemExit(1) from e

# 分析国家和地区分布
def country_distribution(df):
    """Print the number of rows for each value of the 'country' column.

    Args:
        df: DataFrame with a 'country' column.
    """
    print("国家分布:\n", df['country'].value_counts())

# 分析城市级别分布
def city_distribution(df):
    """Print the number of rows for each value of the 'location' column.

    Args:
        df: DataFrame with a 'location' column.
    """
    print("\n城市分布:\n", df['location'].value_counts())

# 分析提交频率
def submission_frequency(df):
    """Print the number of rows recorded for each 'user_id'.

    Args:
        df: DataFrame with a 'user_id' column.
    """
    print("\n提交频率:\n", df['user_id'].value_counts())

# 分析活跃时间段
def active_hours(df):
    """Print the number of rows per hour of day, ordered by hour.

    Side effects on ``df``: 'event_time' is converted through
    ``ensure_datetime`` and a new 'hour' column is added.

    Args:
        df: DataFrame with an 'event_time' column.
    """
    # Make sure 'event_time' is a datetime column before using the .dt accessor.
    ensure_datetime(df, 'event_time')
    df['hour'] = df['event_time'].dt.hour

    hourly_counts = df['hour'].value_counts().sort_index()
    print("\n活跃时间段:\n", hourly_counts)

# 主函数
def main(file_path='C:/Users/y2209/Desktop/users_combined_info_500_part_7.csv'):
    """Run the analysis: detect the encoding, load the CSV, print the summaries.

    Args:
        file_path: CSV file to analyse. The default keeps the path the
            notebook was originally written against; pass another path to
            run the same analysis on a different file.
    """
    encoding = detect_encoding(file_path)
    df = read_csv(file_path, encoding)

    country_distribution(df)
    city_distribution(df)
    submission_frequency(df)
    active_hours(df)


if __name__ == "__main__":
    main()

检测到的编码: utf-8
文件读取成功
国家分布:
 country
United States     28136
Canada            15612
Germany           12983
France             8581
China              8135
Netherlands        5574
Norway             4513
Poland             4496
Italy              4240
United Kingdom     4101
Switzerland        3833
Spain              3800
Australia          3225
Belgium            2610
Slovenia           2589
Sweden             2531
Denmark            1668
New Zealand        1366
Lithuania          1186
South Africa        973
Israel              922
Name: count, dtype: int64

城市分布:
 location
Oslo, Norway                      4513
Zurich, Switzerland               3833
Montreal, Québec, Canada          3779
France                            3625
Paris, France                     3356
                                  ... 
Rome, Italy                        743
Redmond, WA                        727
北京                                 644
Delaware (but Philly at heart)     599
San Francisco              

In [None]:
import pandas as pd
import chardet

# 检测文件编码
def detect_encoding(file_path):
    """Guess the text encoding of a file with chardet.

    Reads the file in binary mode, passes all of its bytes to
    ``chardet.detect``, then prints and returns the reported encoding.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The value of the ``'encoding'`` key in chardet's result.
    """
    with open(file_path, 'rb') as binary_file:
        content = binary_file.read()

    detection = chardet.detect(content)
    found = detection['encoding']
    print(f"检测到的编码: {found}")
    return found

# 尝试读取CSV文件并处理异常
def read_csv(file_path, encoding, fallback_encodings=("utf-8", "gb18030")):
    """Load a CSV file into a DataFrame, retrying with fallback encodings.

    The file is first read with ``encoding``. If decoding fails (or the
    codec name is unknown), each entry of ``fallback_encodings`` that has
    not been tried yet is attempted in order.

    Args:
        file_path: Path of the CSV file.
        encoding: Preferred encoding, e.g. the value returned by
            ``detect_encoding``.
        fallback_encodings: Encodings to try after ``encoding`` fails.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        SystemExit: With exit status 1 when the file is missing, is empty,
            or cannot be decoded with any candidate encoding.
    """
    # Build the ordered, de-duplicated list of encodings to attempt.
    candidates = []
    seen = set()
    for name in (encoding, *fallback_encodings):
        key = str(name).lower()
        if key not in seen:
            seen.add(key)
            candidates.append(name)

    last_error = None
    for candidate in candidates:
        try:
            df = pd.read_csv(file_path, encoding=candidate)  # read the data from the CSV file
        except FileNotFoundError as e:
            print(f"文件未找到: {e}")
            # `exit()` is the interactive helper injected by `site`; raise
            # SystemExit directly and use a non-zero status for the failure.
            raise SystemExit(1) from e
        except pd.errors.EmptyDataError as e:
            print(f"发现空数据: {e}")
            raise SystemExit(1) from e
        except (UnicodeDecodeError, LookupError) as e:
            # Actually do what the message announces: move on to the next encoding.
            print(f"编码错误: {e}。尝试使用不同的编码重新读取文件。")
            last_error = e
            continue
        print("文件读取成功")
        return df

    print("所有候选编码均无法读取文件。")
    raise SystemExit(1) from last_error

# 国家和地区分布
def country_distribution(df, output_path):
    """Write the per-country row counts to a CSV file.

    The file has the columns 'Country' and 'Count'.

    Args:
        df: DataFrame with a 'country' column.
        output_path: Destination path of the CSV file.
    """
    (
        df['country']
        .value_counts()
        .rename('Count')
        .rename_axis('Country')
        .to_csv(output_path)
    )
    print(f"国家分布已保存到CSV文件：{output_path}")

# 城市级别分布
def city_distribution(df, output_path):
    """Write the per-location row counts to a CSV file.

    The file has the columns 'City' and 'Count'.

    Args:
        df: DataFrame with a 'location' column.
        output_path: Destination path of the CSV file.
    """
    location_counts = df['location'].value_counts()
    location_counts.to_frame('Count').to_csv(output_path, index_label='City')
    print(f"城市分布已保存到CSV文件：{output_path}")



# 提交频率
def submission_frequency(df, output_path):
    """Write the number of rows per 'user_id' to a CSV file.

    The file has the columns 'User ID' and 'Count'.

    Args:
        df: DataFrame with a 'user_id' column.
        output_path: Destination path of the CSV file.
    """
    per_user = df['user_id'].value_counts().rename('Count').rename_axis('User ID')
    per_user.to_csv(output_path)
    print(f"提交频率已保存到CSV文件：{output_path}")

# 活跃时间段分析
def active_hours(df, output_path):
    """Write the number of rows per hour of day to a CSV file.

    The file has the columns 'Hour' and 'Count', ordered by hour.

    Side effects on ``df``: 'event_time' is replaced by its
    ``pd.to_datetime`` conversion and a new 'hour' column is added.

    Args:
        df: DataFrame with an 'event_time' column.
        output_path: Destination path of the CSV file.
    """
    event_times = pd.to_datetime(df['event_time'])
    df['event_time'] = event_times
    df['hour'] = event_times.dt.hour

    (
        df['hour']
        .value_counts()
        .sort_index()
        .rename('Count')
        .rename_axis('Hour')
        .to_csv(output_path)
    )
    print(f"活跃时间段已保存到CSV文件：{output_path}")

# 用户活跃度随时间变化
def user_activity_over_time(df, output_path):
    """Write the number of non-null 'user_id' rows per calendar month to a CSV file.

    The file has the columns 'Month' and 'Count'; each bin is labelled
    with its month-end date.

    Side effect on ``df``: 'event_time' is replaced by its
    ``pd.to_datetime`` conversion.

    Args:
        df: DataFrame with 'event_time' and 'user_id' columns.
        output_path: Destination path of the CSV file.
    """
    df['event_time'] = pd.to_datetime(df['event_time'])
    # Use the MonthEnd offset object rather than the 'M' string alias: the
    # alias is deprecated in recent pandas in favour of 'ME', while the
    # offset object is accepted by both old and new versions.
    monthly_counts = df.resample(pd.offsets.MonthEnd(), on='event_time')['user_id'].count()
    monthly_counts.to_csv(output_path, header=['Count'], index_label='Month')
    print(f"用户活跃度随时间变化已保存到CSV文件：{output_path}")

# 主函数
def main(file_path='C:/Users/y2209/Desktop/users_combined_info_500_part_1_output.csv',
         output_dir='C:/Users/y2209/Desktop/'):
    """Run the analysis and save each summary as a CSV file in ``output_dir``.

    Args:
        file_path: CSV file to analyse. The default keeps the path the
            notebook was originally written against.
        output_dir: Directory that receives the result files. A trailing
            path separator is optional.
    """
    # Local import keeps this cell self-contained; `os` is not imported at the top of the cell.
    import os

    def out(name):
        # os.path.join copes with output_dir given with or without a trailing separator.
        return os.path.join(output_dir, name)

    # Detect the encoding and read the file.
    encoding = detect_encoding(file_path)
    df = read_csv(file_path, encoding)

    # Run each analysis and save its result.
    country_distribution(df, out('country_distribution.csv'))
    city_distribution(df, out('city_distribution.csv'))
    submission_frequency(df, out('submission_frequency.csv'))
    active_hours(df, out('active_hours.csv'))
    user_activity_over_time(df, out('user_activity_over_time.csv'))


if __name__ == "__main__":
    main()

检测到的编码: utf-8
文件读取成功
国家分布已保存到CSV文件：C:/Users/y2209/Desktop/country_distribution.csv


KeyError: 'city'