In [None]:
import pandas as pd
from scipy.stats import ttest_ind
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import ace_tools as tools  # Custom display function

# Survey questions used as predictors; the last entry (question 10) is the
# video the respondent says they would choose.
relevant_columns = [
    "3、您如何描述您的个性？ ",
    "4、您喜欢哪种电影类型？ ",
    "6、您对科学或技术进步的好奇程度有多强？",
    "5、您有多喜欢情感丰富的内容？ ",
    "8、您对探索新想法和概念的开放程度如何？ ",
    "7、您对社会关系或人类学的兴趣有多大？",
    "10、如果可以选择，您会选择哪一段视频观看？"
]

# Text rating -> numeric score. Answers that are not keys of this dict become
# NaN when mapped.
rating_mapping = {
    "很不满意": 1,
    "不满意": 2,
    "一般": 3,
    "满意": 4,
    "很满意": 5
}

# Rating questions asked after each video; the first entry of each list is the
# "how much did you like this video" question used to decide who rated it.
scifi_rating_columns = ["11、您有多喜欢这段视频？", "12、这次观看是否让您更欣赏这种类型或题材的内容？", "13、根据刚才的观看体验，您觉得未来自己会有多大意愿主动寻找类似的视频"]
romance_rating_columns = ["16、您有多喜欢这段视频？", "17、您认为视频这次观看是否让您更欣赏这种类型或题材的内容？", "18、根据刚才的观看体验，您觉得未来自己会有多大意愿主动寻找类似的视频"]


def _build_rating_frame(source, rating_columns, mean_column):
    """Return the rows of ``source`` that answered the first rating question.

    The returned frame holds ``rating_columns`` plus ``relevant_columns``, a new
    ``mean_column`` with the row-wise mean of the mapped ratings, and every
    predictor column (all of ``relevant_columns`` except the last) replaced by
    integer codes from a LabelEncoder fitted on this frame only.
    """
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of `source` (avoids SettingWithCopyWarning).
    frame = source[rating_columns + relevant_columns].dropna(subset=[rating_columns[0]]).copy()

    # Row-wise mean of the numeric ratings; NaN entries are skipped by mean().
    numeric_ratings = frame[rating_columns].apply(lambda answers: answers.map(rating_mapping))
    frame[mean_column] = numeric_ratings.mean(axis=1)

    # Codes are local to this frame: each call fits its own encoders.
    for col in relevant_columns[:-1]:
        frame[col] = LabelEncoder().fit_transform(frame[col].astype(str))
    return frame


# Respondents who answered every predictor question and question 10.
data_filtered = df[relevant_columns].dropna()

# Preprocess Sci-Fi Data
data_filtered_scifi = _build_rating_frame(df, scifi_rating_columns, "SciFi_Rating_Mean")

# Preprocess Romance Data
data_filtered_romance = _build_rating_frame(df, romance_rating_columns, "Romance_Rating_Mean")


Key Predictor for Video Choice

In [None]:
# Predictor questions (3-8, including question 4) followed by the target,
# question 10 (which video the respondent would choose).
relevant_columns = [
    "3、您如何描述您的个性？ ",
    "4、您喜欢哪种电影类型？ ",
    "6、您对科学或技术进步的好奇程度有多强？",
    "5、您有多喜欢情感丰富的内容？ ",
    "8、您对探索新想法和概念的开放程度如何？ ",
    "7、您对社会关系或人类学的兴趣有多大？",
    "10、如果可以选择，您会选择哪一段视频观看？"
]

# Keep only rows with an answer to every listed question. .copy() makes the
# frame independent of `df`, so the in-place encoding below does not write
# into a slice (avoids SettingWithCopyWarning).
data_filtered = df[relevant_columns].dropna().copy()

# Encode every column (features and target) as integer codes and keep the
# fitted encoders so the same label -> code mapping can be reused later.
label_encoders = {}
for col in relevant_columns:
    le = LabelEncoder()
    data_filtered[col] = le.fit_transform(data_filtered[col].astype(str))
    label_encoders[col] = le

# Split data into features and target
X = data_filtered.iloc[:, :-1]  # Features: every column except the last
y = data_filtered.iloc[:, -1]   # Target: encoded answer to question 10

# Hold out 20% of the rows; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier (RandomForestClassifier must be imported
# from sklearn.ensemble alongside RandomForestRegressor)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Rank the predictors by the forest's feature importances
importances = clf.feature_importances_
feature_importances = pd.DataFrame({
    'Feature': relevant_columns[:-1],
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Display the feature importance table
tools.display_dataframe_to_user(name="Updated Key Predictors For Video Choice", dataframe=feature_importances)


Key Predictor for Sci-Fi Rating

In [None]:
# Predictor questions: everything in relevant_columns except question 10.
feature_columns = relevant_columns[:-1]

# Re-read the raw answers from `df` for the rows kept in data_filtered_scifi.
# The feature columns of data_filtered_scifi were already replaced by integer
# codes during preprocessing, so passing their string form ("0", "1", ...) to
# label_encoders[col].transform would raise ValueError for unseen labels.
# NOTE(review): assumes df has a unique index so .loc returns one row per label.
raw_scifi_features = df.loc[data_filtered_scifi.index, feature_columns]

# Apply the shared label -> code mapping learned by label_encoders. Labels the
# encoder never saw (including missing answers, which become "nan") map to NaN
# instead of raising.
encoded_scifi_features = pd.DataFrame(
    {
        col: raw_scifi_features[col].astype(str).map(
            {label: code for code, label in enumerate(label_encoders[col].classes_)}
        )
        for col in feature_columns
    },
    index=raw_scifi_features.index,
)

# Target: mean sci-fi rating. The column is spelled 'SciFi_Rating_Mean' where
# it is created; 'Scifi_Rating_Mean' raised KeyError.
scifi_target = data_filtered_scifi['SciFi_Rating_Mean']

# Keep only rows with a known code for every feature and a numeric target.
usable_scifi_rows = encoded_scifi_features.notna().all(axis=1) & scifi_target.notna()
X_scifi = encoded_scifi_features.loc[usable_scifi_rows].astype(int)  # Features
y_scifi = scifi_target.loc[usable_scifi_rows]                        # Target

# Split into training and testing sets
X_train_scifi, X_test_scifi, y_train_scifi, y_test_scifi = train_test_split(
    X_scifi, y_scifi, test_size=0.2, random_state=42
)

# Train the Random Forest Regressor
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_train_scifi, y_train_scifi)

# Rank the predictors by importance for the sci-fi rating
importances_scifi = regressor.feature_importances_
feature_importances_scifi = pd.DataFrame({
    'Feature': feature_columns,
    'Importance': importances_scifi
}).sort_values(by='Importance', ascending=False)

# Display the feature importance table for sci-fi ratings
tools.display_dataframe_to_user(name="Key Predictors For Sci-Fi Rating", dataframe=feature_importances_scifi)


Key Predictor for Romance Rating

In [None]:
# Predictor questions: everything in relevant_columns except question 10.
feature_columns = relevant_columns[:-1]

# Re-read the raw answers from `df` for the rows kept in data_filtered_romance.
# The feature columns of data_filtered_romance were already replaced by integer
# codes during preprocessing, so passing their string form ("0", "1", ...) to
# label_encoders[col].transform would raise ValueError for unseen labels.
# NOTE(review): assumes df has a unique index so .loc returns one row per label.
raw_romance_features = df.loc[data_filtered_romance.index, feature_columns]

# Apply the shared label -> code mapping learned by label_encoders. Labels the
# encoder never saw (including missing answers, which become "nan") map to NaN
# instead of raising.
encoded_romance_features = pd.DataFrame(
    {
        col: raw_romance_features[col].astype(str).map(
            {label: code for code, label in enumerate(label_encoders[col].classes_)}
        )
        for col in feature_columns
    },
    index=raw_romance_features.index,
)

# Target: mean romance rating computed during preprocessing.
romance_target = data_filtered_romance['Romance_Rating_Mean']

# Keep only rows with a known code for every feature and a numeric target.
usable_romance_rows = encoded_romance_features.notna().all(axis=1) & romance_target.notna()
X_romance = encoded_romance_features.loc[usable_romance_rows].astype(int)  # Features
y_romance = romance_target.loc[usable_romance_rows]                        # Target

# Split into training and testing sets
X_train_romance, X_test_romance, y_train_romance, y_test_romance = train_test_split(
    X_romance, y_romance, test_size=0.2, random_state=42
)

# Train the Random Forest Regressor
regressor_romance = RandomForestRegressor(random_state=42)
regressor_romance.fit(X_train_romance, y_train_romance)

# Rank the predictors by importance for the romance rating
importances_romance = regressor_romance.feature_importances_
feature_importances_romance = pd.DataFrame({
    'Feature': feature_columns,
    'Importance': importances_romance
}).sort_values(by='Importance', ascending=False)

# Display the feature importance table for romance ratings
tools.display_dataframe_to_user(name="Key Predictors For Romance Rating", dataframe=feature_importances_romance)


Mean Ratings with p-Values for Sci-Fi

In [None]:
# Per-respondent mean sci-fi rating. The rating columns themselves still hold
# text answers, so use the numeric 'SciFi_Rating_Mean' column built during
# preprocessing instead of averaging the text columns.
scifi_scores = data_filtered_scifi["SciFi_Rating_Mean"]

# Mean rating for sci-fi across all participants who rated the sci-fi video
mean_scifi_all = scifi_scores.mean()

# Participants who chose sci-fi in question 10. Question 10 is never encoded
# in data_filtered_scifi, so compare the raw answer text rather than an
# encoded integer (which would never match and leave the group empty).
# NOTE(review): assumes the option text is exactly '科幻' — confirm against the data.
choice_column = "10、如果可以选择，您会选择哪一段视频观看？"
scifi_choosers = data_filtered_scifi[
    data_filtered_scifi[choice_column].astype(str) == '科幻'
]
mean_scifi_choosers = scifi_choosers["SciFi_Rating_Mean"].mean()

# Two-sample t-test: all raters vs. sci-fi choosers; NaN scores are omitted.
# NOTE(review): the choosers are a subset of the first sample, so the samples
# are not independent; consider comparing choosers vs. non-choosers instead.
t_stat_scifi, p_value_scifi = ttest_ind(
    scifi_scores,
    scifi_choosers["SciFi_Rating_Mean"],
    nan_policy='omit'
)

# Prepare sci-fi results (the single test statistic is repeated on both rows)
sci_fi_results = {
    "Group": ["All Participants (Sci-Fi)", "Sci-Fi Lovers"],
    "Mean Rating": [mean_scifi_all, mean_scifi_choosers],
    "T-Statistic": [t_stat_scifi, t_stat_scifi],
    "P-Value": [p_value_scifi, p_value_scifi]
}

# Display sci-fi results
sci_fi_df = pd.DataFrame(sci_fi_results)
tools.display_dataframe_to_user(name="Mean Ratings For Sci-Fi Videos", dataframe=sci_fi_df)


Mean Ratings with p-Values for Romance

In [None]:
# Per-respondent mean romance rating. The rating columns themselves still hold
# text answers, so use the numeric 'Romance_Rating_Mean' column built during
# preprocessing; this also mirrors how the sci-fi summary selects its rows.
romance_scores = data_filtered_romance["Romance_Rating_Mean"]

# Mean rating across all participants who rated the romance video
mean_romance_all = romance_scores.mean()

# Participants who chose "浪漫" in question 10. This group was never defined
# before being used (NameError); build it from the raw question-10 answers.
# NOTE(review): assumes the option text is exactly '浪漫' — confirm against the data.
choice_column = "10、如果可以选择，您会选择哪一段视频观看？"
romance_choosers = data_filtered_romance[
    data_filtered_romance[choice_column].astype(str) == '浪漫'
]
mean_romance_choosers = romance_choosers["Romance_Rating_Mean"].mean()

# Two-sample t-test: all raters vs. romance choosers; NaN scores are omitted.
# NOTE(review): the choosers are a subset of the first sample, so the samples
# are not independent; consider comparing choosers vs. non-choosers instead.
t_stat_romance, p_value_romance = ttest_ind(
    romance_scores,
    romance_choosers["Romance_Rating_Mean"],
    nan_policy='omit'
)

# Prepare romance results (the single test statistic is repeated on both rows)
romance_results = {
    "Group": ["All Participants (Romance)", "Romance Lovers"],
    "Mean Rating": [mean_romance_all, mean_romance_choosers],
    "T-Statistic": [t_stat_romance, t_stat_romance],
    "P-Value": [p_value_romance, p_value_romance]
}

# Display romance results
romance_df = pd.DataFrame(romance_results)
tools.display_dataframe_to_user(name="Mean Ratings For Romance Videos", dataframe=romance_df)
