# Import

https://www.kaggle.com/competitions/feedback-prize-effectiveness/overview

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# PD Setting


In [2]:
# Display configuration for pandas output in this notebook:
#   max_columns / max_rows = None -> never truncate columns or rows
#   width / max_colwidth          -> wide console and long cell text
#   unicode.*                     -> align East-Asian (wide) characters correctly
_DISPLAY_OPTIONS = {
    'display.max_columns': None,
    'display.max_rows': None,
    'display.width': 2000,
    'display.max_colwidth': 2000,
    'display.unicode.ambiguous_as_wide': True,
    'display.unicode.east_asian_width': True,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

# Load Data

In [3]:
# Read the competition train/test CSV files from the local data directory
train_path = "./data/train.csv"
test_path = "./data/test.csv"
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
# Report (rows, columns) of each frame; the printed labels read
# "training data size" / "prediction data size"
print('实验数据大小:', train_data.shape)
print('预测数据大小:', test_data.shape)

实验数据大小: (36765, 5)
预测数据大小: (10, 4)


# EDA

* discourse_id - ID code for discourse element 
* essay_id - ID code for essay response. This ID code corresponds to the name of the full-text file in the train/ folder.
* discourse_text - Text of discourse element. 
* discourse_type - Class label of discourse element. 
* discourse_type_num - Enumerated class label of discourse element (this column is not present in the `train.csv` loaded above, which has only five columns).
* discourse_effectiveness - Quality rating of discourse element, the target.

In [4]:
# Column names, non-null counts and dtypes of the training frame
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36765 entries, 0 to 36764
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   discourse_id             36765 non-null  object
 1   essay_id                 36765 non-null  object
 2   discourse_text           36765 non-null  object
 3   discourse_type           36765 non-null  object
 4   discourse_effectiveness  36765 non-null  object
dtypes: object(5)
memory usage: 1.4+ MB


In [5]:
# Summary statistics; all columns are object-typed, so this reports count/unique/top/freq
train_data.describe()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
count,36765,36765,36765,36765,36765
unique,36765,4191,36691,7,3
top,0013cc385424,91B1F82B2CF1,Summer projects should be student-designed,Evidence,Adequate
freq,1,23,14,12105,20977


display 查看所有columns

* Lead - an introduction that begins with a statistic, a quotation, a description, or some other device to grab the reader’s attention and point toward the thesis
* Position - an opinion or conclusion on the main question
* Claim - a claim that supports the position
* Counterclaim - a claim that refutes another claim or gives an opposing reason to the position
* Rebuttal - a claim that refutes a counterclaim
* Evidence - ideas or examples that support claims, counterclaims, or rebuttals.
* Concluding Statement - a concluding statement that restates the claims

In [15]:
# Relative frequency of each discourse type
train_data['discourse_type'].value_counts(normalize=True)

discourse_type
Evidence                0.329253
Claim                   0.325772
Position                0.109452
Concluding Statement    0.091146
Lead                    0.062315
Counterclaim            0.048225
Rebuttal                0.033837
Name: proportion, dtype: float64

Our task is to predict the quality rating of each discourse element. Human readers rated each rhetorical or argumentative element, in order of increasing quality, as one of:

* Ineffective
* Adequate
* Effective

In [6]:
# Relative frequency of each effectiveness rating (the prediction target)
train_data['discourse_effectiveness'].value_counts(normalize=True)

discourse_effectiveness
Adequate       0.570570
Effective      0.253665
Ineffective    0.175765
Name: proportion, dtype: float64

head()

用于显示 DataFrame 的前几行数据的方法

In [7]:
# Peek at the first five rows
train_data.head(5)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about how this face on Mars is a natural landform or if there is life on Mars that made it. The story is about how NASA took a picture of Mars and a face was seen on the planet. NASA doesn't know if the landform was created by life on Mars, or if it is just a natural landform.",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a natural landform because I dont think that there is any life on Mars. In these next few paragraphs, I'll be talking about how I think that is is a natural landform",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform because there is no life on Mars that we have descovered yet,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The reason why I think it is a natural landform because, nobody live on Mars in order to create the figure. It says in paragraph 9, ""It's not easy to target Cydonia,"" in which he is saying that its not easy to know if it is a natural landform at this point. In all that they're saying, its probably a natural landform.",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by alieans because they thought that there was life on Mars.,Counterclaim,Adequate


isnull().sum()

用来统计每一列中缺失值（NaN）的数量

In [8]:
# Count the missing values in every column
missing_values = train_data.isna().sum()
# ...or, equivalently (isnull is an alias of isna)
missing_values = train_data.isnull().sum()

# Show only the columns that actually contain missing values
missing_values[missing_values > 0]


# Inspect the first few rows in which a given column is missing (NaN)
# train_data[train_data['column_name'].isnull()].head()

Series([], dtype: int64)

.nunique()

用于返回对象中除 NA/null 值外的不同(unique)的非重复值的数量。

In [9]:
# Number of distinct non-null values in each column
train_data.nunique()

discourse_id               36765
essay_id                    4191
discourse_text             36691
discourse_type                 7
discourse_effectiveness        3
dtype: int64

.unique()

查看某列里，所有不同的（唯一）值

.corr()

* 'pearson'：标准的皮尔逊相关系数。
* 'kendall'：Kendall Tau 相关系数。
* 'spearman'：Spearman 秩相关系数。

The distribution of categories

Check the proportion of missing values.

Visualize the relationship between features

View Distribution of each type of data

In [None]:

# The per-class plots are only informative when positive and negative samples are balanced,
# so randomly down-sample class 0 to the size of class 1 (seeded for reproducibility).
# NOTE(review): train_data as loaded above has no 'target' column, so this template cell
# needs a binary 0/1 'target' column before it can run.
# NOTE(review): assumes class 0 has at least as many rows as class 1; otherwise .sample() raises.
_target = train_data['target']
df_minority = train_data[_target == 1]
df_majority = train_data[_target == 0]
df_majority_downsampled = df_majority.sample(n=len(df_minority), random_state=42)
train_data = pd.concat([df_majority_downsampled, df_minority], axis=0)



def ViewCategoricalData(data, cat_cols):
    """Draw one count plot per categorical column, coloured by ``target``.

    Args:
        data: DataFrame containing a ``target`` column plus every column in ``cat_cols``.
        cat_cols: names of the categorical columns to plot.

    The frame is melted to long form so a FacetGrid can give every column its own
    panel (three panels per row, independent axes). Tick labels are rotated 45 degrees.
    """
    long_form = pd.melt(data, id_vars=['target'], value_vars=cat_cols)
    grid = sns.FacetGrid(long_form, col="variable", hue="target", col_wrap=3, sharex=False, sharey=False)
    grid = grid.map(sns.countplot, "value", alpha=0.6).add_legend()

    for axis in grid.axes.flat:
        tick_labels = axis.get_xticklabels()
        # Pin the tick positions before re-labelling so the labels stay attached to them
        axis.set_xticks(range(len(tick_labels)))
        axis.set_xticklabels(tick_labels, rotation=45)

    plt.show()


def ViewContinuousData(data, num_cols):
    """Draw one kernel-density plot per numeric column, coloured by ``target``.

    Args:
        data: DataFrame containing a ``target`` column plus every column in ``num_cols``.
        num_cols: names of the continuous columns to plot.

    The frame is melted to long form so a FacetGrid can give every column its own
    panel (three panels per row, independent axes).
    """
    f = pd.melt(data, id_vars=['target'], value_vars=num_cols)
    g = sns.FacetGrid(f, col="variable", hue="target", col_wrap=3, sharex=False, sharey=False)
    g = g.map(sns.kdeplot, "value", alpha=0.6).add_legend()

    # Render explicitly, as ViewCategoricalData does, so the figure also appears
    # outside an inline-plotting notebook.
    plt.show()

# Data Process



* 数据清洗（Data Cleaning）：
  
处理缺失值：可以通过删除含有缺失值的记录、填充缺失值（如使用均值、中位数或众数填充）等方式进行。

去除重复值：识别并删除数据集中的重复记录。

纠正错误值：检查数据中的逻辑错误或异常值，并进行相应的修正。

* 数据转换（Data Transformation）：
  
标准化/归一化：将数值属性缩放到某个特定范围，比如0-1之间，或者将其标准化以拥有零均值和单位方差。

编码分类变量：如你之前提到的，可以使用LabelEncoder或OneHotEncoder对分类数据进行编码。

特征构造：基于现有特征创建新的有意义的特征。

* 数据集成（Data Integration）：
  
合并来自不同源的数据，确保一致性，解决实体识别问题，以及处理冗余和冲突的数据。

* 数据规约（Data Reduction）：
  
降维：通过主成分分析（PCA）、线性判别分析（LDA）等技术减少数据维度。

数量规约：使用参数模型或非参数模型来代替原始数据。

* 数据离散化（Data Discretization）：
  
将连续型数据转换为离散区间，便于某些算法处理。

* 数据分割（Data Splitting）：
  
将数据集划分为训练集、验证集和测试集，用于模型训练、调参和评估。

In [None]:
# Stack train and test into one frame so both can be preprocessed together
full_data = pd.concat([train_data,test_data],ignore_index=True)

In [None]:
# Data Cleaning
# A menu of alternative imputation recipes: keep the ones that fit the data and
# replace the 'column_name' placeholder with a real column.

# Fill every missing value with one constant
full_data.fillna(0, inplace=True)

# Fill one column with a sentinel letter (the result must be assigned back to take effect)
full_data['column_name'] = full_data['column_name'].fillna('U')

# Forward fill: propagate the previous non-missing value; usually suited to time series
full_data.ffill(inplace=True)

# Backward fill: the opposite, use the next non-missing value
full_data.bfill(inplace=True)

# Mean / median / mode imputation.
# Assign back instead of using inplace=True on a column selection: the in-place form
# acts on a temporary object and is not guaranteed to update full_data.
# Fill with the mean
full_data['column_name'] = full_data['column_name'].fillna(full_data['column_name'].mean())

# Fill with the median
full_data['column_name'] = full_data['column_name'].fillna(full_data['column_name'].median())

# Fill with the mode
full_data['column_name'] = full_data['column_name'].fillna(full_data['column_name'].mode()[0])

# Interpolation: estimate missing values from neighbouring points; well suited to time series
full_data.interpolate(method='linear', inplace=True)

# Fill with the most frequent value of the column
full_data = full_data.fillna({'column_name': full_data['column_name'].value_counts().idxmax()})

# Derive a Deck column from the first character of each Cabin value.
# Anything that is not a non-empty string (e.g. a NaN or a 0 left by an earlier fill)
# is mapped to 'Unknown' instead of raising.
full_data['Deck'] = full_data['Cabin'].apply(
    lambda x: x[0] if isinstance(x, str) and x and x != 'Unknown' else 'Unknown'
)



In [None]:
# Data Drop
# A menu of alternative row/column removal recipes; replace the placeholder names before use.
# NOTE(review): every statement mutates full_data in place, so re-running the cell is not idempotent.

# Drop every row whose column_name equals 'some_value'
full_data.drop(full_data[full_data['column_name'] == 'some_value'].index, inplace=True)

# Drop rows that contain any missing value
full_data.dropna(inplace=True)

# Drop columns that contain any missing value
full_data.dropna(axis=1, inplace=True)

# Drop the named column(s)
full_data.drop('column_name', axis=1, inplace=True)
full_data.drop(['column_name1', 'column_name2'], axis=1, inplace=True)

# Drop columns conditionally
thresh = len(full_data) * 0.6  # a column needs at least 60% non-null values to be kept
full_data.dropna(axis=1, thresh=thresh, inplace=True)

In [None]:
# Data Transformation
#  Rank Gauss Transformation
from scipy.special import erfinv

def rank_gauss(x):
    """Rank-Gauss transform: map values to ranks, then through the inverse error function.

    Args:
        x: 1-D numpy array or pandas Series of numeric values.

    Returns:
        An object of the same kind and length as ``x``. The smallest value maps to
        ``erfinv(-1 + 1e-6)``, the largest to ``erfinv(1 - 1e-6)`` and the ranks in
        between are spread linearly over (-1, 1) before ``erfinv`` is applied.
        Empty input yields an empty float result; a single value maps to 0.0.

    Equal values receive distinct ranks in whatever order ``argsort`` returns them.
    """
    ranks = x.argsort().argsort()  # position of each element in sorted order (0 .. n-1)
    if len(ranks) == 0:
        # Nothing to transform; the original raised on an empty ndarray
        return ranks.astype(float)
    top_rank = ranks.max()
    if top_rank == 0:
        # A single observation has no spread: 0/0 would give NaN, so return the centre
        return ranks * 0.0
    scaled = (ranks / top_rank - 0.5) * 2  # rescale ranks to [-1, 1]
    epsilon = 1e-6
    # erfinv(+-1) is infinite, so keep the extremes strictly inside the open interval
    scaled = np.clip(scaled, -1 + epsilon, 1 - epsilon)
    return erfinv(scaled)

# Apply the rank-Gauss transform to every configured float column: train first, then test.
# Each frame is checked against its OWN columns (the original tested train_data.columns
# while transforming test_data, which raises KeyError for a column missing from test).
# NOTE(review): FLOAT_COLS is not defined anywhere in the visible notebook.
# NOTE(review): train and test are ranked independently of each other; confirm that is intended.
for frame in (train_data, test_data):
    for col in FLOAT_COLS:
        if col in frame.columns:
            # Print mean/std before transforming, for reference
            print(col, frame[col].mean(), frame[col].std())
            frame[col] = rank_gauss(frame[col])

In [None]:
# Data Transformation   Min-Max normalisation
# Rescales the data to a given range (usually [0, 1] or [-1, 1]).
# NOTE(review): the name `scaler` is re-bound by the StandardScaler and MaxAbsScaler cells below.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))  # scale to [0, 1]
df_scaled = pd.DataFrame(scaler.fit_transform(train_data), columns=train_data.columns)
# Apply the parameters fitted on the training set to the test set; do not fit on the test set, to avoid data leakage
test_data_scaled = scaler.transform(test_data)

In [None]:
# Data Transformation   Z-Score standardisation
# Rescales each feature to zero mean and unit variance (not to a fixed range).
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_standardized = pd.DataFrame(scaler.fit_transform(train_data), columns=train_data.columns)

In [None]:
# Data Transformation   MaxAbs normalisation
# Rescales the data to the [-1, 1] range; suitable for sparse data.
from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()
df_maxabs = pd.DataFrame(scaler.fit_transform(train_data), columns=train_data.columns)

In [None]:
# Data Transformation: label-encode every object-typed column
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Work on a copy so the original training frame is not modified.
# NOTE(review): this re-binds full_data, replacing the train+test frame built earlier.
full_data = train_data.copy()
# Keep one fitted encoder per column. A single shared encoder only remembers the classes
# of the last column it was fitted on, so earlier columns could not be decoded with
# inverse_transform.
label_encoders = {}
for column in full_data.columns:
    if full_data[column].dtype == 'object':
        le = LabelEncoder()
        full_data[column] = le.fit_transform(full_data[column])
        label_encoders[column] = le

In [None]:
# Data Discretization
# NOTE(review): the first two recipes are skew-reducing log transforms; only the last two are binning.

# Log transform: reduces skewness, especially with many small values and a few very large ones,
# bringing the distribution closer to normal
full_data['column_name'] = np.log1p(full_data['column_name'])

# Find the numeric columns and compute their skewness, keep those whose absolute skew exceeds 1,
# and log-transform them to reduce the skew
numeric_df = full_data.select_dtypes(['float64','int32','int64'])
numeric_cols = numeric_df.columns.tolist()
skewed_cols = full_data[numeric_cols].apply(lambda x: x.skew()).sort_values(ascending=False)
skewed_df = pd.DataFrame({'skew':skewed_cols})
skew_cols = skewed_df[skewed_df['skew'].abs()>1].index.tolist()
for col in skew_cols:
    # Do not take the log of a column containing zeros or negatives directly; shift it to be positive first
    # NOTE(review): after the shift the minimum becomes 1e-8, whose log is about -18.4
    min_val = full_data[col].min()
    if min_val <= 0:
        full_data[col] = full_data[col] - min_val + 1e-8  # shift the range so every value is > 0
    full_data[col] = np.log(full_data[col])


# Binning discretises continuous data; it can reduce the influence of outliers and make models more stable
# Split into 5 equal-width intervals (bins)
full_data['column_name'] = pd.cut(full_data['column_name'], bins=5, labels=False)


# Custom bin edges chosen by hand
# The statements run in ascending threshold order, so already-assigned codes 0-2 (all <= 7.91) are not re-matched
full_data.loc[full_data['column_name'] <= 7.91, 'column_name'] = 0
full_data.loc[(full_data['column_name'] > 7.91) & (full_data['column_name'] <= 14.454), 'column_name'] = 1
full_data.loc[(full_data['column_name'] > 14.454) & (full_data['column_name'] <= 31), 'column_name'] = 2
full_data.loc[full_data['column_name'] > 31, 'column_name'] = 3



In [None]:
# Data Split
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows (seeded for reproducibility)
# NOTE(review): X and y are not defined in the visible notebook; the *_temp names suggest a
# further validation/test split of the held-out part was intended.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42) 

Clean Text

In [None]:
import re
import string
import nltk
import pandas as pd
# Fetch the NLTK resources used below: stop-word list, tokenizer model and WordNet
# NOTE(review): some newer NLTK releases may also require the 'punkt_tab' resource for word_tokenize — confirm
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# retrieve english stop words (as a set, for fast membership tests)
stop_words = set(stopwords.words('english'))

# Lower-case a text value
def convert_to_lowercase(text):
    """Return ``text`` lower-cased when it is a string.

    Missing values (as judged by ``pd.isna``) and non-string values are returned unchanged.
    """
    if not pd.isna(text) and isinstance(text, str):
        return text.lower()
    return text

# Remove all punctuation from the text
def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Missing values (as judged by ``pd.isna``) are returned unchanged.

    Uses ``str.translate`` rather than the character class ``[{string.punctuation}]``:
    inside that class the backslash escapes the following ``]``, so backslashes
    themselves were never removed.
    """
    if pd.isna(text):
        return text
    return text.translate(str.maketrans('', '', string.punctuation))

# Strip digits from the text
def remove_numbers(text):
    """Delete every run of digits from ``text``; missing values are returned unchanged."""
    if pd.isna(text):
        return text
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# Tokenise the text, then drop tokens of two characters or fewer and stop words
def remove_short_words_and_stop_words(text):
    """Return ``text`` with short tokens and stop words removed.

    The text is split with ``word_tokenize``; a token is kept only when it is longer
    than two characters and is not in ``stop_words``. The kept tokens are joined with
    single spaces. Missing values are returned unchanged.
    """
    if pd.isna(text):
        return text
    kept_tokens = []
    for token in word_tokenize(text):
        if len(token) <= 2 or token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Collapse runs of spaces into one space
def remove_multiple_spaces(text):
    """Replace every run of two or more space characters with a single space.

    Only the space character is affected (tabs and newlines are left alone).
    Missing values are returned unchanged.
    """
    if pd.isna(text):
        return text
    return re.compile(r' {2,}').sub(' ', text)

# Remove urls
def remove_urls(text):
    """Delete URLs that start with ``www.``, ``http://`` or ``https://``.

    Missing values (as judged by ``pd.isna``) are returned unchanged.

    The dot in ``www\\.`` is escaped so it only matches a literal dot. Unescaped it
    matched any character, so ordinary text such as "awww,nice" was deleted as a URL.
    """
    if pd.isna(text):
        return text
    url_pattern = r'(www\.|http[s]?://)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.sub(url_pattern, '', text)

# Remove HTML tags and entities
def remove_html(text):
    """Delete ``<...>`` tags and ``&name;`` / ``&#123;`` / ``&#x1f;`` entities from ``text``.

    Entity names and hex digits are matched in lower case only. Missing values are
    returned unchanged.
    """
    if pd.isna(text):
        return text
    html_matcher = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return html_matcher.sub('', text)

# Remove @mentions and '#' characters
# Mentions are matched in lower case only, so lower-case the text first
def remove_tags(text):
    """Delete ``@name`` mentions (lower-case letters/digits) and every ``#`` character.

    The word after a ``#`` is kept; only the ``#`` itself is removed. Missing values
    are returned unchanged.
    """
    if pd.isna(text):
        return text
    mention_or_hash = re.compile(r'@([a-z0-9]+)|#')
    return mention_or_hash.sub('', text)

# Compiled once at definition time instead of on every call.
# The original class contained the range U+24C2-U+1F251, which spans CJK ideographs,
# Hangul, kana and many other scripts, so ordinary non-Latin text was deleted along with
# emoji. It is replaced here by the specific symbol blocks it was standing in for.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"  # dingbats
                            u"\U00002600-\U000026FF"  # miscellaneous symbols
                            u"\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
                            u"\U000024C2"             # circled M
                            u"\U0001F200-\U0001F251"  # enclosed ideographic supplement
                            u"\U0000FE0F"             # variation selector-16 (emoji presentation)
                            "]+", flags=re.UNICODE)


def remove_emoji(text):
    """Delete emoji and pictographic symbols from ``text``.

    Missing values (as judged by ``pd.isna``) are returned unchanged. Text in
    non-Latin scripts (e.g. Chinese, Korean) is preserved.
    """
    if pd.isna(text):
        return text
    return _EMOJI_PATTERN.sub(r'', text)

def preprocess_text(text):
    """Normalise ``text``: strip symbols, lower-case, drop stop words, lemmatise.

    Steps:
        1. Remove every character that is not a letter, digit or whitespace.
        2. Split on any whitespace and lower-case each token.
        3. Drop tokens found in ``stop_words``.
        4. Lemmatise the remaining tokens with WordNet and join them with single spaces.

    Missing values (as judged by ``pd.isna``) are returned unchanged.

    Splitting on any whitespace (not just a single space) matters: a token such as
    "the\\n" would otherwise fail the stop-word check and be kept.
    """
    if pd.isna(text):
        return text
    cleaned_text = re.sub(r'[^a-zA-Z\d\s]+', '', text)
    # One lemmatizer for the whole text instead of a new instance per word
    lemmatizer = WordNetLemmatizer()
    tokens = (token.lower() for token in cleaned_text.split())
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Split CamelCase strings into separate words

def split_camel_case(text):
    """Insert a space at CamelCase word boundaries.

    A boundary is either a lower-case letter or digit followed by an upper-case letter
    ("camelCase" -> "camel Case"), or the end of an upper-case run followed by a
    capitalised word ("HTTPServer" -> "HTTP Server").

    Acronyms stay intact ("NASA" is no longer turned into "N A S A") and words that are
    already separated do not gain a second space. Missing values are returned unchanged.
    """
    if pd.isna(text):
        return text
    return re.sub(r'(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])', ' ', text)

# Turn domain-like tokens such as example.com into "example com" (the dot becomes a space)
def preserve_domain(text):
    """Replace the dot between an alphanumeric run and a run of 2+ letters with a space.

    Decimals such as "3.14" and single-letter suffixes such as "a.b" are left alone.
    Missing values are returned unchanged.
    """
    if pd.isna(text):
        return text
    domain_dot = re.compile(r'([a-zA-Z0-9]+)\.([a-zA-Z]{2,})')
    return domain_dot.sub(r'\1 \2', text)

def clean_text_bycol(df, col):
    """Run the full text-cleaning pipeline over one column of ``df``.

    Args:
        df: DataFrame to clean; column ``col`` is overwritten in place.
        col: name of the text column.

    Returns:
        The same ``df`` object, with ``df[col]`` cleaned.

    The steps run in the order listed below, one pass over the column per step.
    """
    cleaning_steps = (
        split_camel_case,
        convert_to_lowercase,
        remove_urls,
        remove_html,
        remove_tags,
        remove_numbers,
        remove_short_words_and_stop_words,
        remove_emoji,
        remove_multiple_spaces,
        preserve_domain,
        remove_punctuation,
    )
    for step in cleaning_steps:
        df[col] = df[col].apply(step)
    return df

def clean_text(text):
    """Run the full text-cleaning pipeline over a single text value.

    Applies the same steps, in the same order, as ``clean_text_bycol`` and returns the
    cleaned value. Each step passes missing values through unchanged.

    The leftover debugging ``print(text)`` has been removed: it echoed every input, which
    floods the notebook output when the function is applied to a whole column.
    """
    cleaning_steps = (
        split_camel_case,
        convert_to_lowercase,
        remove_urls,
        remove_html,
        remove_tags,
        remove_numbers,
        remove_short_words_and_stop_words,
        remove_emoji,
        remove_multiple_spaces,
        preserve_domain,
        remove_punctuation,
    )
    for step in cleaning_steps:
        text = step(text)
    return text

# Model

# Evaluation

In [None]:
# Implement the evaluation metrics that we will use to assess our models

def accuracy(predicted_labels, true_labels):
    """
    Accuracy is correct predictions / all predictions

    Args:
        predicted_labels (np.ndarray[int, 1]): the integer labels from the predictions. Uni-dimensional
        true_labels (np.ndarray[int, 1]): the integer labels from the gold standard. Uni-dimensional

    Returns:
        accuracy_value (double): fraction of positions where the two label arrays agree;
            0.0 when there are no predictions (instead of raising ZeroDivisionError),
            matching how precision/recall treat an empty denominator.

    """
    # asarray lets plain lists be passed and compares positionally
    predicted_labels = np.asarray(predicted_labels)
    true_labels = np.asarray(true_labels)
    total = predicted_labels.shape[0]
    if total == 0:
        return 0.0
    correct = int(np.count_nonzero(predicted_labels == true_labels))
    return correct / total

def precision(predicted_labels, true_labels, positive_label=1):
    """
    Precision is True Positives / All Positive Predictions

    Args:
        predicted_labels (np.ndarray[int, 1]): the integer labels from the predictions. Uni-dimensional
        true_labels (np.ndarray[int, 1]): the integer labels from the gold standard. Uni-dimensional
        positive_label (int): the label treated as the positive class. Defaults to 1.

    Returns:
        precision_value (double): 0.0 when nothing was predicted positive.

    A false positive is any positive prediction whose true label is not ``positive_label``.
    For 0/1 labels this equals the previous ``true_labels == 0`` test; with more than two
    classes it keeps the denominator equal to all positive predictions, as documented.
    """
    predicted_positive = np.asarray(predicted_labels) == positive_label
    actual_positive = np.asarray(true_labels) == positive_label
    TP = np.sum(predicted_positive & actual_positive)
    FP = np.sum(predicted_positive & ~actual_positive)
    return float(TP / (TP + FP)) if (TP + FP) > 0 else 0.0

def recall(predicted_labels, true_labels, positive_label=1):
    """
    Recall is True Positives / All Positive Labels

    Args:
        predicted_labels (np.ndarray[int, 1]): the integer labels from the predictions. Uni-dimensional
        true_labels (np.ndarray[int, 1]): the integer labels from the gold standard. Uni-dimensional
        positive_label (int): the label treated as the positive class. Defaults to 1.

    Returns:
        recall_value (double): 0.0 when there are no positive gold labels.

    A false negative is any positive gold label whose prediction is not ``positive_label``.
    For 0/1 labels this equals the previous ``predicted_labels == 0`` test; with more than
    two classes it keeps the denominator equal to all positive labels, as documented.
    """
    predicted_positive = np.asarray(predicted_labels) == positive_label
    actual_positive = np.asarray(true_labels) == positive_label
    TP = np.sum(predicted_positive & actual_positive)
    FN = np.sum(~predicted_positive & actual_positive)
    return float(TP / (TP + FN)) if (TP + FN) > 0 else 0.0

def f1_score(predicted_labels, true_labels):
    """
    F1 score: the harmonic mean of precision and recall, 2PR / (P + R)

    Args:
        predicted_labels (np.ndarray[int, 1]): the integer labels from the predictions. Uni-dimensional
        true_labels (np.ndarray[int, 1]): the integer labels from the gold standard. Uni-dimensional

    Returns:
        f1_score_value (double): 0 when precision and recall are both 0

    """
    p_value = precision(predicted_labels, true_labels)
    r_value = recall(predicted_labels, true_labels)
    denominator = p_value + r_value
    if denominator > 0:
        return 2 * p_value * r_value / denominator
    return 0

In [None]:
# NOTE: this import re-binds f1_score to scikit-learn's implementation, replacing the
# hand-written f1_score defined in the previous cell.
from sklearn.metrics import confusion_matrix,accuracy_score, f1_score, precision_score, recall_score

# y_true / y_pred are the gold and predicted labels. One spelling is used throughout
# (the original mixed yTrue/yPred with y_true/y_pred, which raises NameError).
# NOTE(review): neither name is defined in the visible notebook.
cm = confusion_matrix(y_true, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Results go into *_value names so they do not overwrite the accuracy / precision / recall
# functions defined in the previous cell.
# NOTE(review): f1/precision/recall use scikit-learn's default averaging; for the
# three-class effectiveness target an explicit `average=` argument is presumably needed — confirm.
accuracy_value = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy_value)

f1 = f1_score(y_true, y_pred)
print("F1 Score:", f1)

precision_value = precision_score(y_true, y_pred)
print("Precision Score:", precision_value)

recall_value = recall_score(y_true, y_pred)
print("Recall Score:", recall_value)