In [None]:
import pandas as pd

In [None]:
# Load the ofo article dump (read_csv defaults: the first row is taken as the header).
ofo = pd.read_csv("ofo.csv")

In [None]:
# Replace the file's header with the three column names every later cell relies on.
# This assignment raises if the file does not have exactly three columns.
ofo.columns = ["title", "datetime", "content"]

In [None]:
# Preview the first rows to confirm the columns were mapped as intended.
ofo.head()

In [None]:
# (rows, columns) of the raw ofo frame.
ofo.shape

In [None]:
# Load the mobike article dump the same way.
mobike = pd.read_csv("mobike.csv")

In [None]:
# Same three-column schema as the ofo frame so the two can be concatenated.
mobike.columns = ["title", "datetime", "content"]

In [None]:
# Preview the first rows of the mobike frame.
mobike.head()

In [None]:
# (rows, columns) of the raw mobike frame.
mobike.shape

In [None]:
# Stack both sources into a single frame with a fresh 0..n-1 index.
# ignore_index=True builds that index directly, replacing the
# reset_index().drop('index', axis=1) round trip.
merged = pd.concat([ofo, mobike], ignore_index=True)

In [None]:
# Preview the combined frame.
merged.head()

In [None]:
# Row count should equal the sum of the two source frames.
merged.shape

In [None]:
# List rows whose title already appeared earlier in the frame
# (duplicated() with its default marks every occurrence except the first).
merged[merged.title.duplicated()]

In [None]:
# Keep only the first article for each title.
# The explicit .copy() detaches the result from the frame it was sliced from,
# so the columns added to `merged` in later cells are written to an
# independent frame and do not trigger pandas' SettingWithCopyWarning.
merged = merged.loc[~merged["title"].duplicated()].copy()

In [None]:
# Row count after de-duplication; the difference from the earlier shape is
# the number of repeated titles that were removed.
merged.shape

In [None]:
def count_ofo(mystr):
    """Count mentions of the brand name "ofo" in a text, ignoring case.

    Args:
        mystr: Article text. Any non-string value (for example the NaN that
            pandas produces for an empty CSV cell) is treated as empty text.

    Returns:
        int: Number of non-overlapping occurrences of "ofo" in any
        capitalisation ("ofo", "OFO", "Ofo", ...).
    """
    if not isinstance(mystr, str):
        return 0
    # Lower-casing first also catches mixed-case spellings such as "Ofo",
    # which separate 'OFO' and 'ofo' counts would miss.
    return mystr.lower().count('ofo')

In [None]:
# Per-article number of ofo mentions, stored next to the text for the
# brand-attribution comparison further down.
merged["count_ofo"] = merged.content.apply(count_ofo)

In [None]:
def count_mobike(mystr):
    """Count mentions of Mobike in a text, by Chinese or Latin name.

    Args:
        mystr: Article text. Any non-string value (for example the NaN that
            pandas produces for an empty CSV cell) is treated as empty text.

    Returns:
        int: Occurrences of '摩拜' plus occurrences of "mobike" in any
        capitalisation ("mobike", "Mobike", "MOBIKE", ...).
    """
    if not isinstance(mystr, str):
        return 0
    # The Latin name is matched case-insensitively so that the common
    # "Mobike" spelling is counted; the Chinese name has no case.
    return mystr.count('摩拜') + mystr.lower().count('mobike')

In [None]:
# Per-article number of mobike mentions.
merged["count_mobike"] = merged.content.apply(count_mobike)

In [None]:
# Both count columns should now be visible.
merged.head()

In [None]:
# How many articles mention mobike strictly more often than ofo.
merged[merged.count_mobike > merged.count_ofo].shape

In [None]:
# Attribute those articles to mobike.
# NOTE: this rebinds `mobike`; the raw frame loaded from mobike.csv is no
# longer reachable under that name after this cell runs.
mobike = merged[merged.count_mobike > merged.count_ofo]

In [None]:
# Renumber the rows and remove the helper count columns.
# drop=True discards the old index instead of materialising it as an 'index'
# column that then has to be dropped, and errors='ignore' lets this cell be
# re-run after the count columns are already gone (previously a KeyError).
mobike = (
    mobike
    .reset_index(drop=True)
    .drop(columns=['count_ofo', 'count_mobike'], errors='ignore')
)

In [None]:
# Preview the articles attributed to mobike.
mobike.head()

In [None]:
# Number of articles attributed to mobike.
mobike.shape

In [None]:
# How many articles remain for ofo (mobike count not greater than ofo count).
merged[merged.count_mobike <= merged.count_ofo].shape

In [None]:
# Everything not attributed to mobike goes to ofo. Because the comparison is
# `<=`, ties are included, and so are articles where both counts are zero.
# NOTE: this rebinds `ofo`, replacing the raw frame loaded from ofo.csv.
ofo = merged[merged.count_mobike <= merged.count_ofo]

In [None]:
# Renumber the rows and remove the helper count columns.
# drop=True discards the old index instead of materialising it as an 'index'
# column that then has to be dropped, and errors='ignore' lets this cell be
# re-run after the count columns are already gone (previously a KeyError).
ofo = (
    ofo
    .reset_index(drop=True)
    .drop(columns=['count_ofo', 'count_mobike'], errors='ignore')
)

In [None]:
# Preview the articles attributed to ofo.
ofo.head()

In [None]:
# Number of articles attributed to ofo; together with the mobike frame this
# should account for every row of `merged`.
ofo.shape

# Sentiment Analysis

## snownlp

In [None]:
from snownlp import SnowNLP
def get_sentiment(text):
    """Return the SnowNLP sentiment score for `text`.

    Args:
        text: The document to score; it is passed to SnowNLP unchanged.

    Returns:
        The `sentiments` attribute of the SnowNLP object built from `text`.
        Later cells treat scores above 0.5 as positive.
    """
    analysis = SnowNLP(text)
    return analysis.sentiments

In [None]:
# Score every article with SnowNLP (one call per row) and store the result
# in a new `sentiment` column on each brand frame.
ofo["sentiment"] = ofo.content.apply(get_sentiment)
mobike["sentiment"] = mobike.content.apply(get_sentiment)

In [None]:
# ofo articles scored above 0.5 (treated as positive).
ofo[ofo.sentiment > 0.5].shape

In [None]:
# ofo articles scored 0.5 or below (treated as negative).
ofo[ofo.sentiment <= 0.5].shape

In [None]:
# mobike articles scored above 0.5 (treated as positive).
mobike[mobike.sentiment > 0.5].shape

In [None]:
# mobike articles scored 0.5 or below (treated as negative).
mobike[mobike.sentiment <= 0.5].shape

## bosonnlp

In [None]:
# Disabled BosonNLP scoring, kept for reference. Running it needs the
# `bosonnlp` package and an API key stored in boson_api_key.json; the
# notebook instead loads previously saved results from
# sentiment_bosonnlp.pickle in a later cell.

# from bosonnlp import BosonNLP

# import json

# with open("boson_api_key.json") as f:
#     secret = json.load(f)

# nlp = BosonNLP(secret["boson_api"])

# def get_sentiment_bosonnlp(text):
#     return nlp.sentiment(text, model="news")[0][0]

# ofo["sentiment"] = ofo.content.apply(get_sentiment_bosonnlp)

# mobike["sentiment"] = mobike.content.apply(get_sentiment_bosonnlp)

In [None]:
import pickle

In [None]:
# Disabled: writes both scored frames to the cache file read by the next cell.
# with open('sentiment_bosonnlp.pickle', 'wb') as f:
#     pickle.dump([ofo, mobike], f)

In [None]:
# Replace `ofo` and `mobike` with the frames stored in the cache file.
# This discards the frames built above, including their SnowNLP `sentiment`
# column; presumably the cached frames carry BosonNLP scores instead
# (see the disabled dump cell) - TODO confirm.
# The file must already exist: the only code that writes it is commented out.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load a pickle that you created yourself.
with open('sentiment_bosonnlp.pickle', 'rb') as f:
    [ofo, mobike] = pickle.load(f)

In [None]:
# ofo articles scored above 0.5 in the cached results.
ofo[ofo.sentiment > 0.5].shape

In [None]:
# ofo articles scored 0.5 or below in the cached results.
ofo[ofo.sentiment <= 0.5].shape

In [None]:
# mobike articles scored above 0.5 in the cached results.
mobike[mobike.sentiment > 0.5].shape

In [None]:
# mobike articles scored 0.5 or below in the cached results.
mobike[mobike.sentiment <= 0.5].shape

# Visualization

In [None]:
%matplotlib inline

In [None]:
# Use the timestamp column as the index so it labels the bars in the plot below.
# NOTE: inplace=True makes this cell single-shot; running it a second time
# raises KeyError because 'datetime' is no longer a column.
ofo.set_index('datetime', inplace=True)

In [None]:
# Same re-indexing for mobike (also single-shot for the same reason).
mobike.set_index('datetime', inplace=True)

In [None]:
# Shift the scores by 0.5 so bars above zero are the articles scored > 0.5
# and bars below zero are the ones scored < 0.5.
(ofo[['sentiment']] - 0.5).plot(kind='bar', figsize=(24, 6))

In [None]:
# Same centred bar chart for mobike.
(mobike[['sentiment']] - 0.5).plot(kind='bar', figsize=(24, 6))

In [None]:
import seaborn as sns

In [None]:
# Tag every row with its brand so the frames can be told apart once combined.
ofo["brand"] = "ofo"

In [None]:
mobike["brand"] = "mobike"

In [None]:
# Confirm the new `brand` column and the datetime index.
ofo.head()

In [None]:
# Combined frame used by the box plot below.
# NOTE: the LDA cells later rebind `df`, so re-run this cell before
# re-drawing the box plot.
df = pd.concat([ofo, mobike])

In [None]:
# Compare the sentiment score distributions of the two brands.
sns.boxplot(x='brand', y='sentiment', data=df)

In [None]:
# Split each brand at the 0.5 threshold; a score of exactly 0.5 falls into
# the negative group.
ofo_positive = ofo[ofo.sentiment > 0.5]
ofo_negative = ofo[ofo.sentiment <= 0.5]
mobike_positive = mobike[mobike.sentiment > 0.5]
mobike_negative = mobike[mobike.sentiment <= 0.5]

In [None]:
# Size of each of the four subsets fed to the topic models.
ofo_positive.shape

In [None]:
ofo_negative.shape

In [None]:
mobike_positive.shape

In [None]:
mobike_negative.shape

# LDA

In [None]:
import pyLDAvis 
import pyLDAvis.sklearn 

In [None]:
pyLDAvis.enable_notebook()

In [None]:
from helper import *

In [None]:
stopwords = get_custom_stopwords("stopwordsHIT.txt") # HIT stop-word list
max_df = 0.7 # drop terms that appear in more than this fraction of documents (too common)
min_df = 2 # drop terms that appear in fewer than this many documents (too rare)
n_features = 1000 # maximum number of features to extract
n_top_words = 20 # how many keywords to show when listing a topic
col_content = "content" # name of the column that holds the text

In [None]:
def lda_on_chinese_articles(df, n_topics):
    """Run the helper LDA pipeline on `df` with the notebook-level settings.

    Thin wrapper around `lda_on_chinese_articles_with_param`: the frame and
    the topic count are forwarded positionally, and the remaining options
    are read from the notebook globals at call time.

    Args:
        df: Frame holding the documents; the text column is named by the
            global `col_content`.
        n_topics: Number of topics to request.

    Returns:
        Whatever `lda_on_chinese_articles_with_param` returns. Callers in
        this notebook unpack it as `(lda, tf, vect)`.
    """
    settings = {
        "col_content": col_content,
        "stopwords": stopwords,
        "n_features": n_features,
        "max_df": max_df,
        "min_df": min_df,
        "n_top_words": n_top_words,
    }
    return lda_on_chinese_articles_with_param(df, n_topics, **settings)

In [None]:
# Topics in the ofo articles scored above 0.5.
# NOTE: `df` is rebound here, replacing the combined brand frame that the
# box-plot cell reads.
df = ofo_positive
n_topics = 4
lda, tf, vect = lda_on_chinese_articles(df = df, n_topics = n_topics)
# Last expression of the cell: the prepared pyLDAvis view is shown as output.
pyLDAvis.sklearn.prepare(lda, tf, vect)

In [None]:
# Topics in the ofo articles scored 0.5 or below.
df = ofo_negative
n_topics = 3
lda, tf, vect = lda_on_chinese_articles(df = df, n_topics = n_topics)
pyLDAvis.sklearn.prepare(lda, tf, vect)

In [None]:
# Topics in the mobike articles scored above 0.5.
df = mobike_positive
n_topics = 4
lda, tf, vect = lda_on_chinese_articles(df = df, n_topics = n_topics)
pyLDAvis.sklearn.prepare(lda, tf, vect)

In [None]:
# Topics in the mobike articles scored 0.5 or below.
df = mobike_negative
n_topics = 3
lda, tf, vect = lda_on_chinese_articles(df = df, n_topics = n_topics)
pyLDAvis.sklearn.prepare(lda, tf, vect)