In [7]:
import jieba
import pandas as pd
import re

In [8]:
df_train= pd.read_excel('data_3500.xlsx')

In [9]:
stopwords=[]

with open('stopwords.txt','r',encoding = 'utf8') as f:
    for w in f:
        stopwords.append(w.strip())
        
def load_corpus(path):
    """
    加载语料库
    """
    data = []
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            [_, seniment, content] = line.split(",", 2)
            content = processing(content)
            data.append((content, int(seniment)))
    return data


def load_corpus_bert(path):
    """
    加载语料库
    """
    data = []
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            [_, seniment, content] = line.split(",", 2)
            content = processing_bert(content)
            data.append((content, int(seniment)))
    return data

def get_stopword_list(file):
    with open(file,'r',encoding = 'utf-8') as f:
        stopword_list = [word.strip('\n') for word in f.readlines()]
        return stopword_list
    
def clean_stopword(str, stopword_list):
    result = ''
    word_list = jieba.lcut(str)
    for w in word_list:
        if w not in stopword_list:
            result += w
    return result
    
def processing(text):
    """
    数据预处理, 可以根据自己的需求进行重载
    """
    # 数据清洗部分
    text = re.sub("\{%.+?%\}", " ", text)           # 去除 {%xxx%} (地理定位, 微博话题等)
    text = re.sub("@.+?( |$)", " ", text)           # 去除 @xxx (用户名)
    text = re.sub("【.+?】", " ", text)              # 去除 【xx】 (里面的内容通常都不是用户自己写的)
    text = re.sub("\u200b", " ", text)              # '\u200b'是这个数据集中的一个bad case, 不用特别在意
    # 分词
    words = [w for w in jieba.lcut(text) if w.isalpha()]
    # 对否定词`不`做特殊处理: 与其后面的词进行拼接
    while "不" in words:
        index = words.index("不")
        if index == len(words) - 1:
            break
        words[index: index+2] = ["".join(words[index: index+2])]  # 列表切片赋值的酷炫写法
    # 用空格拼接成字符串
    result = " ".join(words)
    return result


def processing_bert(text):
    """
    数据预处理, 可以根据自己的需求进行重载
    """
    # 数据清洗部分
    text = re.sub("\{%.+?%\}", " ", text)           # 去除 {%xxx%} (地理定位, 微博话题等)
    text = re.sub("@.+?( |$)", " ", text)           # 去除 @xxx (用户名)
    text = re.sub("【.+?】", " ", text)              # 去除 【xx】 (里面的内容通常都不是用户自己写的)            # '\u200b'是这个数据集中的一个bad case, 不用特别在意
    return text


In [10]:
for i in range(0,len(df_train)):
    df_train['context'][i] = processing(df_train['context'][i])
    df_train['context'][i] = clean_stopword(df_train['context'][i],stopwords)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['context'][i] = processing(df_train['context'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['context'][i] = clean_stopword(df_train['context'][i],stopwords)


In [11]:
df_train.to_csv('data_cleaned.csv',encoding='utf_8_sig')
df_train = df_train.drop(df_train[df_train['emotion'] == 0].index)
df_train = df_train.dropna(axis=0, how = 'all')
df_train

Unnamed: 0,context,time,emotion
2,肖战 期待 冬奥 赛场 抹 中国 红 加油 加油,2022-02-04 08:58,1.0
4,北京 冬奥会 闭幕式 期待 下次 冬奥,2022-02-20 19:59,1.0
5,冬奥 开幕式 骂 偷国 选手 想 美 疫情 思考 热带 国家 震撼 中国 魂 狠...,2022-02-04 23:31,-1.0
6,今年冬天 恨不能 国内 想 环球 影城 想 yyqx 电影 更想 冬奥 疫...,2022-02-04 22:55,-1.0
7,冬奥 黑 那下 一届 米兰 极有 牌 不到,2022-02-20 20:37,-1.0
...,...,...,...
3402,爱 运动 中国 安踏 冬奥 加油 冰雪 运动 健儿 加油,2022-02-04 11:59,1.0
3404,今日 立春 冬奥 开幕 美国队 进场 走 太 散漫,2022-02-04 20:59,-1.0
3406,冰雪 温度 完 感觉 励志 冰雪 运动 魅力 真的 敬佩 年龄 ...,2022-02-04 15:59,1.0
3407,参加 北京 冬奥 闭幕式 运动员 数 破纪录 b 疫情,2022-02-20 20:04,-1.0


In [6]:
df_train.to_csv('cleaned2.csv',encoding = "utf_8_sig")

In [44]:
train_data = df_train[['context','emotion']][0:1500]

In [45]:
test_data = df_train[['context','emotion']][1501:1719]

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(token_pattern='\[?\w+\]?', 
                             stop_words=stopwords)
X_train = vectorizer.fit_transform(train_data["context"])
y_train = train_data["emotion"]



In [47]:
X_test = vectorizer.transform(test_data["context"])
y_test = test_data["emotion"]

In [48]:
from sklearn import svm

clf = svm.SVC()
clf.fit(X_train, y_train)

SVC()

In [54]:
y_pred = clf.predict(X_test)
y_pred

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1., -1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1., -1.,  1., -1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1., -1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1., -1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1., -1.,  1.,  1., -1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1., -1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1., -1.,  1.,  1., -1.,  1.,  1., -1.,
        1.,  1.,  1.,  1.,  1.,  1., -1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1., -1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1., -1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1

In [50]:
def evaluate(output, label) -> float:
    # Use the code below to obtain the accuracy of your algorithm
    error = float((output != label).sum()) * 1. / len(output)
    print('Error: {:2.4f}%'.format(100 * error))

    return error

In [55]:
evaluate(y_pred, y_test)

Error: 18.8073%


0.18807339449541285

In [56]:
from sklearn import metrics

print(metrics.classification_report(y_test, y_pred))
print("准确率:", metrics.accuracy_score(y_test, y_pred))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').