In [None]:
import pandas as pd

# Load the tab-separated training file into `df` and preview its first rows.
df = pd.read_csv('data/train.tsv/train.tsv',sep='\t')
df.head()

In [None]:
# Show the 'Phrase' entry with index label 0, before any preprocessing is applied.
df['Phrase'][0]

In [None]:
import string 

'''
    单词预处理，将单词全部小写，并且去除标点符号
'''
def preprocessing(phrase):
    """Normalise phrases: lowercase, strip punctuation, collapse whitespace.

    Args:
        phrase: Iterable of phrases (e.g. a pandas Series or a list of str).
            Entries that are not ``str`` (for example a NaN standing in for
            a missing cell) are treated as empty phrases instead of raising
            ``AttributeError``.

    Returns:
        list[str]: One cleaned phrase per input entry, in input order. Each
        phrase is lowercased, has every character of ``string.punctuation``
        removed, and has its words re-joined with single spaces.
    """
    # Build the punctuation-removal table once instead of once per phrase.
    strip_punct = str.maketrans('', '', string.punctuation)

    res = []
    for text in phrase:
        if not isinstance(text, str):
            res.append('')
            continue
        cleaned = text.lower().translate(strip_punct)
        # split() + join() trims both ends and collapses whitespace runs.
        res.append(' '.join(cleaned.split()))

    return res

# Replace the 'Phrase' column with its cleaned version, then preview the frame.
df['Phrase'] = preprocessing(df['Phrase'])
df.head()

In [None]:
# Show the 'Phrase' entry with index label 0 again, now after preprocessing.
df['Phrase'][0]

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset,TensorDataset
import numpy as np


# 定义数据读取类
class MyDataset(Dataset):
    """Phrases encoded as averaged GloVe word vectors, served as a torch Dataset.

    Every entry of ``df['Phrase']`` becomes one vector of length ``word_len``:
    the mean of the GloVe vectors of its whitespace-separated words, where
    words missing from the vocabulary contribute a zero vector. Labels are
    taken from ``df['Sentiment']`` when that column is present.

    Attributes:
        vocab_path: Path of the GloVe text file.
        wordlen: Dimensionality of the word vectors.
        word_to_vec_map: Dict mapping each vocabulary word to its vector.
        data: NumPy array holding one averaged vector per phrase.
        label: List of labels, or ``None`` when ``df`` has no 'Sentiment'
            column (unlabeled data).
    """

    # Parsed embedding files keyed by path, shared by all instances so that
    # building several datasets does not re-parse the same file each time.
    # The cached dicts are shared objects: treat them as read-only. Clear this
    # dict if the file on disk changes during the session.
    _glove_cache = {}

    def __init__(self,df,vocab_path='data/glove.6B/glove.6B.300d.txt',word_len=300):
        """Load the embeddings and encode every phrase of ``df``.

        Args:
            df: DataFrame with a 'Phrase' column and, optionally, a
                'Sentiment' column holding the labels.
            vocab_path: Path of the GloVe text file to load.
            word_len: Dimensionality of the vectors stored in that file.
        """
        super(MyDataset, self).__init__()

        self.vocab_path = vocab_path
        self.wordlen = word_len
        _, _, self.word_to_vec_map = self.load_glove_embeddings()

        # nan_to_num is kept as a safety net for non-finite values in the
        # embedding file; empty phrases no longer produce NaN themselves.
        self.data = np.nan_to_num(
            np.array([self.sentence_to_avg(text) for text in df['Phrase']]),
            nan=0,
        )

        # Unlabeled frames have no 'Sentiment' column; the features can still
        # be built for them instead of failing with a KeyError.
        self.label = list(df['Sentiment']) if 'Sentiment' in df else None

    def __getitem__(self, index):
        """Return the sample at ``index``.

        Returns:
            ``(features, label)`` as a float32 tensor and a long tensor when
            labels are available; only the float32 feature tensor otherwise.
        """
        d = torch.tensor(self.data[index],dtype=torch.float32)
        if self.label is None:
            return d
        l = torch.tensor(self.label[index],dtype=torch.long)

        return d,l

    def __len__(self):
        """Return the number of encoded phrases."""
        return self.data.shape[0]

    def sentence_to_avg(self,sentence):
        """Convert a sentence into the average of its word vectors.

        Out-of-vocabulary words count as zero vectors, so they lower the
        average rather than being skipped. A sentence without any words, or a
        non-string value, yields the all-zero vector.

        Args:
            sentence: The phrase to encode.

        Returns:
            NumPy array of length ``self.wordlen``.
        """
        avg = np.zeros(self.wordlen)
        if not isinstance(sentence, str):
            return avg

        words = sentence.lower().strip().split()
        # Guard the division: 0 / 0 would emit a RuntimeWarning and give NaN.
        if not words:
            return avg

        for w in words:
            vec = self.word_to_vec_map.get(w)
            if vec is not None:
                avg += vec

        return avg / len(words)

    def load_glove_embeddings(self):
        """Parse the GloVe file at ``self.vocab_path``.

        Each non-blank line is split on whitespace into a word followed by
        its vector components. Results are cached per path, so repeated calls
        for the same file return the same objects without re-reading it.

        Returns:
            Tuple ``(words_to_index, index_to_words, word_to_vec_map)`` where
            the index dicts number the sorted vocabulary starting at 1 and
            ``word_to_vec_map`` maps each word to a float32 NumPy vector.
        """
        cached = self._glove_cache.get(self.vocab_path)
        if cached is not None:
            return cached

        word_to_vec_map = {}
        with open(self.vocab_path, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Skip blank lines instead of failing on fields[0].
                    continue
                word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float32)

        words_to_index = {}
        index_to_words = {}
        for i, w in enumerate(sorted(word_to_vec_map), start=1):
            words_to_index[w] = i
            index_to_words[i] = w

        result = (words_to_index, index_to_words, word_to_vec_map)
        self._glove_cache[self.vocab_path] = result
        return result

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a dev set; the fixed random_state makes the split repeatable.
train_df,dev_df = train_test_split(df,test_size=0.1,random_state=42)

In [None]:
# Encode both splits first, then wrap each one in a shuffled DataLoader
# that yields batches of 32 samples.
train_dataset = MyDataset(train_df)
dev_dataset = MyDataset(dev_df)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=True)

In [None]:
# Peek at the first encoded sample. Indexing fetches it directly instead of
# iterating (and tensor-converting) the whole dataset just to print item 0.
trains, labels = train_dataset[0]
print(trains)
print(labels)

训练SVM模型

In [None]:
from sklearn.svm import SVC

# Fit a support-vector classifier with scikit-learn's default settings on the
# averaged phrase vectors (train_dataset.data) and their labels.
clf = SVC()

clf.fit(train_dataset.data,train_dataset.label)


使用SVM模型预测结果

In [None]:
#train_pred = clf.predict(train_dataset.data)
# Predict labels for the held-out dev vectors.
dev_pred = clf.predict(dev_dataset.data)

In [None]:
from sklearn import metrics

#train_acc = metrics.accuracy_score(train_dataset.label,train_pred)
# Accuracy of the dev predictions against the dev labels.
dev_acc = metrics.accuracy_score(dev_dataset.label,dev_pred)

In [None]:
# Display the dev-set accuracy computed above.
dev_acc

In [None]:
# Keep the test frame under its own name: rebinding `df` here would replace the
# training frame, so re-running an earlier cell would silently use test data.
test_df = pd.read_csv('data/test.tsv/test.tsv',sep='\t')
test_df['Phrase'] = preprocessing(test_df['Phrase'])

# Encode the test phrases the same way as the training data, then predict.
test_dataset = MyDataset(test_df)
test_pred = clf.predict(test_dataset.data)

In [None]:
# TODO: point this at the submission template CSV (presumably the competition's
# sample submission file - confirm); pd.read_csv cannot read an empty path.
Submission_path = ''
submission = pd.read_csv(Submission_path)
# Item assignment always writes a DataFrame column. Attribute assignment
# (`submission.Sentiment = ...`) only does so when the column already exists;
# otherwise it sets a plain Python attribute that to_csv would never write.
submission['Sentiment'] = test_pred

In [None]:
# kaggle score  0.627
# Write the submission frame to CSV without the DataFrame index column.
submission.to_csv('my_submission.csv',index=False)