In [58]:
#导入数据处理的基础包
import numpy as np
import pandas as pd

#导入用于计数的包
from collections import Counter

#导入tf-idf相关的包
from sklearn.feature_extraction.text import TfidfTransformer    
from sklearn.feature_extraction.text import CountVectorizer

#导入模型评估的包
from sklearn import metrics


#包tqdm是用来对可迭代对象执行时生成一个进度条用以监视程序运行过程
from tqdm import tqdm


import os

### 1. 读取数据并做文本的处理
你需要完成以下几步操作：
- 去掉无用的字符如！&，可自行定义
- 中文分词
- 去掉低频词

In [10]:
#读取数据
data = pd.read_csv('Corona_NLP_train.csv')
#观察数据格式
data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [3]:
data['OriginalTweet']

0        @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...
1        advice Talk to your neighbours family to excha...
2        Coronavirus Australia: Woolworths to give elde...
3        My food stock is not the only one which is emp...
4        Me, ready to go at supermarket during the #COV...
                               ...                        
41152    Airline pilots offering to stock supermarket s...
41153    Response to complaint not provided citing COVI...
41154    You know its getting tough when @KameronWilds...
41155    Is it wrong that the smell of hand sanitizer i...
41156    @TartiiCat Well new/used Rift S are going for ...
Name: OriginalTweet, Length: 41157, dtype: object

In [6]:
data['Sentiment'].unique()

array(['Neutral', 'Positive', 'Extremely Negative', 'Negative',
       'Extremely Positive'], dtype=object)

In [4]:
#输出数据的一些相关信息
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


In [5]:
#只保留数据中我们需要的两列：Comment列和Star列
data = data[['OriginalTweet','Sentiment']]
#观察新的数据的格式
data.head()

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [11]:
# 这里的star代表具体的评分。但在这个项目中，我们要预测的是正面还是负面。我们把评分为1和2的看作是负面，把评分为3，4，5的作为正面
data['Sentiment']=data.Sentiment.apply(lambda x:1 if x in [
    'Positive','Extremely Positive','Neutral'] else 0)
data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,1
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,1
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,1
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,1
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",0


#### 任务1： 去掉一些无用的字符

In [17]:
# TODO1: 去掉一些无用的字符，自行定一个字符几何，并从文本中去掉
#    your to do "
na_words = r'"."。&!——“”！,\'?,=＝……·'
def clean(str):
  for c in na_words:
    str = str.replace(c,'')
  return str
data['OriginalTweet'] = data['OriginalTweet'].apply(clean)

In [18]:
data['OriginalTweet'].head(50).tail(10)

40    Were here to provide a safe shopping experienc...
41    Curious  do we think retail shoppers will do a...
42    CHECK VIDEO  https://tco/1ksn9Brl02 No food  i...
43    Breaking Story: Online clothes shopping rises ...
44    This is the line outside  @Target  in as custo...
45    South Africans stock up on food basic goods as...
46     Please Share  Know someone who s 65 Living on...
47    People posting and sharing photos of of half t...
48    Never thought Id say this but 2019 Will you co...
49    COVID-19 restrictions sparking a run on cannab...
Name: OriginalTweet, dtype: object

In [19]:
data['comment_processed'] = data['OriginalTweet'].apply(
    lambda x:x.split()
)

In [20]:
# 观察新的数据的格式
data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,comment_processed
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://tco/iF...,1,"[@MeNyrbie, @Phil_Gahan, @Chrisitv, https://tc..."
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,1,"[advice, Talk, to, your, neighbours, family, t..."
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,1,"[Coronavirus, Australia:, Woolworths, to, give..."
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,1,"[My, food, stock, is, not, the, only, one, whi..."
4,3803,48755,,16-03-2020,Me ready to go at supermarket during the #COVI...,0,"[Me, ready, to, go, at, supermarket, during, t..."


In [37]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yunwanxu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [24]:
from nltk.corpus import stopwords


In [39]:
stopWords = stopwords.words('english')
print(stopWords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

#### 设定停用词并去掉停用词

In [41]:


#print(stopWords)    
# 去除停用词
def rm_stop_word(wordList):
    # your code, remove stop words
    # TODO
   # if wordList:
    for w in stopWords:
      if w in wordList:
        wordList.remove(w)
    return wordList

#这行代码中.progress_apply()函数的作用等同于.apply()函数的作用，只是写成.progress_apply()函数才能被tqdm包监控从而输出进度条。
data['comment_processed'] = data['comment_processed'].apply(rm_stop_word)

In [42]:
# 观察新的数据的格式
data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,comment_processed
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://tco/iF...,1,"[@MeNyrbie, @Phil_Gahan, @Chrisitv, https://tc..."
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,1,"[advice, Talk, neighbours, family, to, exchang..."
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,1,"[Coronavirus, Australia:, Woolworths, give, el..."
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,1,"[My, food, stock, one, is, empty, PLEASE, dont..."
4,3803,48755,,16-03-2020,Me ready to go at supermarket during the #COVI...,0,"[Me, ready, go, supermarket, #COVID19, outbrea..."


#### 去掉低频词，出现次数少于10次的词去掉

In [43]:

lowfreq_words = []
merged_list = [w for L in data['comment_processed'] for w in L ]
def remove_low_freq(word_list):
  for w in word_list:
    if merged_list.count(w)<10:
      word_list.remove(w)
  return ' '.join(word_list)
data['comment_processed'] =  data['comment_processed'].apply(rm_stop_word)

In [44]:
data['comment_processed'] =  data['comment_processed'].apply(lambda x:' '.join(x))

In [45]:
data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,comment_processed
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://tco/iF...,1,@MeNyrbie @Phil_Gahan @Chrisitv https://tco/iF...
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,1,advice Talk neighbours family exchange phone n...
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,1,Coronavirus Australia: Woolworths give elderly...
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,1,My food stock one empty PLEASE dont panic THER...
4,3803,48755,,16-03-2020,Me ready to go at supermarket during the #COVI...,0,Me ready go supermarket #COVID19 outbreak Not ...


### 2. 把文本分为训练集和测试集
选择语料库中的20%作为测试数据，剩下的作为训练数据

In [46]:
data=data.sample(frac=1).reset_index(drop=True)

In [47]:
# TODO5: 把数据分为训练集和测试集. comments_train（list)保存用于训练的文本，comments_test(list)保存用于测试的文本。 y_train, y_test是对应的标签（0、1）

test_ratio = 0.2
n=int(data.shape[0]*(1-test_ratio))
train,test = data.iloc[:n,],data.iloc[n:,]
comments_train, comments_test = train['comment_processed'],test['comment_processed']
y_train, y_test = train['Sentiment'],test['Sentiment']

### 3. 把文本转换成向量的形式



#### 把文本转换成tf-idf向量

In [48]:
# TODO6: 把训练文本和测试文本转换成tf-idf向量。使用sklearn的feature_extraction.text.TfidfTransformer模块
#    请留意fit_transform和transform之间的区别。 常见的错误是在训练集和测试集上都使用 fit_transform，需要避免！ 
#    另外，可以留意一下结果是否为稀疏矩阵
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
vocab = list(set(merged_list))
pipe = Pipeline([('count', CountVectorizer(vocabulary=vocab)),
                  ('tfid', TfidfTransformer())]).fit(comments_train)
pipe['count'].transform(comments_train).toarray()
pipe['tfid'].idf_

tfidf_train=pipe.transform(comments_train)
tfidf_test=pipe.transform(comments_test)
print (tfidf_train.shape, tfidf_test.shape)

(32925, 112138) (8232, 112138)


### 4. 训练模型以及评估


In [49]:
# 导入逻辑回归的包
from sklearn.linear_model import LogisticRegression

In [53]:
from sklearn.model_selection import GridSearchCV

#### 使用tf-idf，并结合逻辑回归训练模型

In [54]:
%%time
import warnings
warnings.filterwarnings("ignore")
params = {
    'penalty': ['l1','l2'],
    'C':list(range(10)),
 #   'solver':['newton-cg','lbfgs','liblinear','sag','saga']
}
lr = LogisticRegression(random_state=0)
clf = GridSearchCV(lr, params,n_jobs=-1)
clf.fit(tfidf_train, y_train)
clf.best_params_

CPU times: user 2.4 s, sys: 1.09 s, total: 3.49 s
Wall time: 39.2 s


{'C': 9, 'penalty': 'l2'}

In [56]:

tf_idf_y_pred = clf.predict(tfidf_test)

In [57]:
#使用tf-idf + 逻辑回归训练模型，需要用gridsearchCV做交叉验证，并选择最好的超参数



print('TF-IDF LR test accuracy %s' % metrics.accuracy_score(y_test, tf_idf_y_pred))
#逻辑回归模型在测试集上的F1_Score
print('TF-IDF LR test F1_score %s' % metrics.f1_score(y_test, tf_idf_y_pred,average="macro"))

TF-IDF LR test accuracy 0.8720845481049563
TF-IDF LR test F1_score 0.8609963383856425
