In [None]:
import os
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import openai
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from openai import OpenAI
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textblob import TextBlob
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
def random_duplicate_rows(df, num_rows, num_repeats, random_state=None):
    """Return a new DataFrame with randomly chosen rows of ``df`` appended as duplicates.

    ``num_rows`` rows are drawn from ``df`` with replacement, shuffled, and that
    batch is appended ``num_repeats`` times. The result therefore has
    ``len(df) + num_rows * num_repeats`` rows and a fresh ``0..n-1`` index.

    Args:
        df: Source DataFrame; it is not modified.
        num_rows: Number of rows to draw (with replacement) for the batch.
        num_repeats: Number of times the drawn batch is appended.
        random_state: Optional integer seed for a reproducible draw. With the
            default ``None`` NumPy's global random state is used, exactly as
            before this parameter existed.

    Returns:
        A new DataFrame: the original rows followed by the duplicated rows.

    Raises:
        ValueError: If ``num_rows`` or ``num_repeats`` is negative, or if rows
            are requested from an empty DataFrame.
    """
    if num_rows < 0 or num_repeats < 0:
        raise ValueError("num_rows and num_repeats must be non-negative")
    # Nothing to append. pd.concat([]) raises on an empty list, so return early
    # instead of failing when num_repeats == 0.
    if num_rows == 0 or num_repeats == 0:
        return df.reset_index(drop=True)
    if df.empty:
        raise ValueError("cannot sample rows from an empty DataFrame")

    # A private RandomState keeps seeded calls from touching the global RNG.
    seeded = random_state is not None
    rng = np.random.RandomState(random_state) if seeded else np.random

    # Randomly pick row labels from the frame (with replacement)
    indices = rng.choice(df.index, num_rows, replace=True)
    # Materialise the picked rows and shuffle their order
    repeated_rows = (
        df.loc[indices]
        .sample(frac=1, random_state=rng if seeded else None)
        .reset_index(drop=True)
    )
    # Repeat the picked batch num_repeats times
    duplicates = pd.concat([repeated_rows] * num_repeats, ignore_index=True)
    # Append the duplicates to the original frame
    df_extended = pd.concat([df, duplicates]).reset_index(drop=True)
    return df_extended

In [None]:
# social_media_data = social_media_data.sample(100)
# social_media_data = random_duplicate_rows(social_media_data, 10, 10)
# social_media_data["IID"] = np.random.randint(10000,20000,social_media_data.shape[0])
# output_file = 'output.csv'
# social_media_data.to_csv(output_file, index=False)

In [None]:
# Load the social media dataset
social_media_data = pd.read_csv('output.csv')

# Show the dataset dimensions (rows, columns)
social_media_data.shape

In [None]:
def look_length(data):
    """Plot a histogram of comment lengths for ``data['SentimentText']``.

    Side effect: adds (or overwrites) a ``comment_length`` column on ``data``
    holding the character count of each comment. Rows whose comment is missing
    get NaN in that column and are left out of the plot.

    Args:
        data: DataFrame with a text column named ``SentimentText``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # .str.len() propagates NaN for missing comments, whereas apply(len) raises
    # TypeError on a float NaN. This matters because the notebook calls this
    # function before rows with missing text are dropped.
    data['comment_length'] = data['SentimentText'].str.len()

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data['comment_length'].dropna(), bins=20, kde=True, ax=ax)
    ax.set_title('Distribution of Comment Length')
    ax.set_xlabel('Comment Length')
    ax.set_ylabel('Frequency')
    plt.show()

def lookF(data, column='SentimentText', top_n=20, max_features=1000):
    """Plot the most frequent words of a text column as a horizontal bar chart.

    Args:
        data: DataFrame containing the text column.
        column: Name of the text column to analyse. Defaults to
            ``'SentimentText'``, the column the function always used before.
        top_n: Number of most frequent words to show (default 20).
        max_features: Vocabulary size cap passed to ``CountVectorizer``
            (default 1000).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Missing values are not valid documents for CountVectorizer; drop them.
    texts = data[column].dropna().astype(str)

    # Bag-of-words model
    vectorizer = CountVectorizer(max_features=max_features)
    X_bow = vectorizer.fit_transform(texts)

    # Total count per vocabulary term, in the vectorizer's feature order
    counts = np.asarray(X_bow.sum(axis=0)).ravel()
    word_freq_df = pd.DataFrame({
        'word': vectorizer.get_feature_names_out(),
        'frequency': counts,
    })

    # Keep the top_n words by descending frequency
    top_words = word_freq_df.nlargest(top_n, 'frequency')

    # Plot
    plt.figure(figsize=(10, 6))
    sns.barplot(x='frequency', y='word', data=top_words)
    plt.title(f'Top {top_n} Most Frequent Words')
    plt.xlabel('Frequency')
    plt.ylabel('Words')
    plt.show()

In [None]:
# Plot the comment-length histogram; look_length also writes a 'comment_length' column onto the frame.
look_length(social_media_data)

In [None]:
# Drop rows that have no comment text (mutates social_media_data in place).
social_media_data.dropna(subset=['SentimentText'], inplace=True)
# Snapshot of the frame before the 'Text' column is added further down.
# NOTE(review): new_data is not read again before a later cell reassigns it - confirm this copy is still needed.
new_data = social_media_data.copy()
# social_media_data.drop_duplicates(subset=['SentimentText'], inplace=True)

# English stop-word set and lemmatizer used by clean_text.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalise a raw comment into a space-separated string of lemmatized words.

    Steps: lowercase, strip HTML markup, replace every non-letter character
    with a space, split on whitespace, drop English stop words, lemmatize.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` instance.

    Args:
        text: Raw comment text. Non-string values (e.g. NaN from a missing
            cell) yield an empty string instead of raising.

    Returns:
        The cleaned text as a single string; may be empty.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = BeautifulSoup(text, "html.parser").get_text()
    # Raw string avoids invalid-escape warnings; text is already lowercase,
    # so a-z covers every letter that a-zA-Z matched.
    text = re.sub(r'[^a-z]', ' ', text)
    # str.split() with no argument already collapses whitespace runs and
    # ignores leading/trailing spaces, so no separate \s+ pass is needed.
    words = text.split()
    return ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )

# Word frequencies of the raw comments (before cleaning).
lookF(social_media_data)

# Cleaned version of every comment, stored alongside the raw text.
social_media_data['Text'] = social_media_data['SentimentText'].apply(clean_text)

# Bag-of-words model over the cleaned text.
vectorizer = CountVectorizer(max_features=1000)
X_bow = vectorizer.fit_transform(social_media_data['Text'])

# Visualize word frequencies
# Get the vocabulary and the corresponding word counts
word_freq = dict(zip(vectorizer.get_feature_names_out(), X_bow.sum(axis=0).tolist()[0]))

# Convert the word-frequency dict to a DataFrame so Seaborn can plot it
word_freq_df = pd.DataFrame(list(word_freq.items()), columns=['word', 'frequency'])

# Sort by frequency descending and keep the top 20 words
top_words = word_freq_df.nlargest(20, 'frequency')

# Plot
# NOTE(review): this title is identical to the one lookF uses for the raw-text plot above.
plt.figure(figsize=(10, 6))
sns.barplot(x='frequency', y='word', data=top_words)
plt.title('Top 20 Most Frequent Words')
plt.xlabel('Frequency')
plt.ylabel('Words')
plt.show()

In [None]:
# Sentiment analysis
def get_sentiment(text):
    """Return the polarity score TextBlob reports for ``text``."""
    return TextBlob(text).sentiment.polarity

# Score every raw (uncleaned) comment and record its character length.
social_media_data['sentiment'] = social_media_data['SentimentText'].apply(get_sentiment)
social_media_data['comment_length'] = social_media_data['SentimentText'].apply(len)


# Distribution of sentiment polarity over the raw comments
plt.figure(figsize=(10, 6))
sns.histplot(social_media_data['sentiment'], bins=50, kde=True)
plt.title('Sentiment of Comments')
plt.xlabel('Sentiment')
plt.ylabel('Frequency')
plt.show()

In [None]:
def analyze_emotion(text, model="gpt-3.5-turbo", client=None):
    """Ask a chat-completion model to name the emotion(s) expressed in ``text``.

    Credentials are read from the environment rather than being embedded in
    the notebook: ``OPENAI_API_KEY`` supplies the key and ``OPENAI_BASE_URL``
    optionally overrides the endpoint (default: the gateway this notebook
    already used).

    NOTE(review): the key that used to be hard-coded here has been committed
    in source and should be treated as compromised and rotated.

    Args:
        text: The comment to analyse.
        model: Chat model name; defaults to ``"gpt-3.5-turbo"``.
        client: Optional pre-built client exposing
            ``chat.completions.create``. Passing one avoids constructing a new
            client for every row when the function is applied to a Series.

    Returns:
        The model's reply text, or ``None`` if the OpenAI client raised an
        ``openai.OpenAIError`` (the error is printed).
    """
    try:
        if client is None:
            client = OpenAI(
                api_key=os.environ.get("OPENAI_API_KEY"),
                base_url=os.environ.get("OPENAI_BASE_URL", "https://api.bianxieai.com/v1"),
            )
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"Analyze the emotion of the following text:\n\n{text}\n Make sure you only give emotion words, or None if you can't parse them"}
            ]
        )
        return response.choices[0].message.content

    except openai.OpenAIError as e:
        print(f"An error occurred: {e}")
        return None

# Run the emotion prompt on the first 10 raw comments only; each row triggers
# one chat-completion request inside analyze_emotion.
data = social_media_data['SentimentText'].head(10).apply(analyze_emotion)
data

In [None]:
# Deduplicated working copy keyed on the comment text; social_media_data
# itself is left unchanged.
new_data = social_media_data.drop_duplicates(subset=['SentimentText']).copy()
new_data.shape