In [1]:
#####Question 3B
import pandas as pd
import jieba
import re
from collections import Counter
import string

# Load the labelled training articles; the 'tags' column holds the label of
# each row.
raw_train_csv = pd.read_csv('offsite-tagging-training-set (1).csv')

# Show which labels occur so the three per-tag splits below can be checked.
tag_column = raw_train_csv['tags']
print(tag_column.unique())

# One sub-frame per tag.
soccer_news = raw_train_csv.loc[tag_column == '足球']
cy_news = raw_train_csv.loc[tag_column == '梁振英']
election_news = raw_train_csv.loc[tag_column == '美國大選']

def remove_unwanted_characters(text):
    """Return ``text`` with every ``<...>`` markup tag deleted.

    A tag is a ``<``, followed by one or more characters other than ``<``
    (matched lazily, newlines included), followed by ``>``. Everything
    outside such tags is returned unchanged.
    """
    tag_pattern = re.compile("<[^<]+?>")
    return tag_pattern.sub("", text)

# Concatenate all article bodies of a tag (newline-separated) into a single
# document and strip the markup tags from it.
soccer_doc, cy_doc, election_doc = (
    remove_unwanted_characters('\n'.join(tag_frame['text'].tolist()))
    for tag_frame in (soccer_news, cy_news, election_news)
)


def remove_stopwords(word_list,stopwords):
    """Drop stop words and punctuation-bearing tokens from a token list.

    Parameters
    ----------
    word_list : iterable of str
        Tokens to filter (e.g. the output of ``jieba.cut``).
    stopwords : iterable of str
        Words to discard; compared by exact equality.

    Returns
    -------
    list of str
        The tokens of ``word_list``, in their original order, that
        * are not in ``stopwords``,
        * are not a substring of ``string.punctuation`` (this also discards
          empty tokens), and
        * contain none of the whitespace / punctuation / symbol characters
          listed in the patterns below.
    """
    # Set membership is O(1); the stop-word collection is consulted per token.
    stopword_set = set(stopwords)

    # Every alternative is a character class, so a token is rejected as soon
    # as it contains any single listed character.
    chi_punc = r"""[\s+.!/_,$%^*("']|[+—！，。？、~@#￥%…&*（）]"""
    chi_punc2 = "[【】╮╯▽╰╭★→「」]"
    # This list used to be applied as a bare pattern, which only matches the
    # entire character sequence verbatim and therefore never filtered
    # anything. Wrapping it in [...] makes each character count on its own.
    chi_punc3 = "[！，❤。～《》：（）【】「」？”“；：、]"
    unwanted = re.compile("|".join((chi_punc, chi_punc2, chi_punc3)))

    return [token for token in word_list
            if token not in stopword_set
            and token not in string.punctuation
            and unwanted.search(token) is None]
    
# Read the stop-word list (one entry per line). The encoding is given
# explicitly so that decoding does not depend on the platform's locale.
# NOTE(review): assumes stopwords.txt is stored as UTF-8 — confirm.
with open('stopwords.txt', encoding='utf-8') as f:
    stopwords = f.read().split('\n')

# Segmentation happens after the file has been closed; it does not need the
# file handle.
soccer_words = remove_stopwords(list(jieba.cut(soccer_doc)), stopwords)
cy_words = remove_stopwords(list(jieba.cut(cy_doc)), stopwords)
election_words = remove_stopwords(list(jieba.cut(election_doc)), stopwords)

# Combined vocabulary of the three tags (unordered, duplicates removed).
all_words = list(set(soccer_words).union(set(cy_words)).union(set(election_words)))
    
# Term frequencies per tag.
soccer_counts = Counter(soccer_words)
cy_counts = Counter(cy_words)
election_counts = Counter(election_words)

# One row per tag (row 0 = soccer, 1 = cy, 2 = election), one column per
# term; a term that is absent from a tag gets a count of 0.
per_tag_counts = (soccer_counts, cy_counts, election_counts)
counts_df = pd.DataFrame([dict(tag_counts) for tag_counts in per_tag_counts]).fillna(0)

from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the per-tag term counts with TF-IDF (smoothed IDF) and put the
# dense result back into a frame that has the same term columns as counts_df.
transformer = TfidfTransformer(smooth_idf=True)
tfidf = transformer.fit(counts_df).transform(counts_df)
vocabulary = counts_df.columns.tolist()
tfidf_df = pd.DataFrame(tfidf.toarray(), columns=vocabulary)

# Rows 0, 1 and 2 of tfidf_df are the soccer, cy and election documents.
# Each row is sorted in ascending score order, so the strongest terms sit
# at the end of the series.
soccer_importance, cy_importance, election_importance = (
    tfidf_df.iloc[row_number].sort_values() for row_number in range(3)
)

# Keep only the terms that have a positive score for the tag.
soccer_important_words = soccer_importance.loc[soccer_importance > 0]
cy_important_words = cy_importance.loc[cy_importance > 0]
election_important_words = election_importance.loc[election_importance > 0]

print('finish ranking important words')

Building prefix dict from the default dictionary ...


['足球' '梁振英' '美國大選']


Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.082 seconds.
Prefix dict has been built succesfully.


finish ranking important words


In [14]:
###### Question 3b exploration: top 150 keywords per tag (up to 450 in total)
import math
from sklearn.linear_model import LogisticRegression
from collections import Counter
import stats_utils

# Number of top-ranked terms to take from each tag.
total_soccer_keywords = 150
total_cy_keywords = 150
total_election_keywords = 150

# The *_important_words series are in ascending score order, so .tail(n)
# returns the n highest-scoring terms of each tag.
soccer_keywords = soccer_important_words.tail(total_soccer_keywords).index.tolist()
cy_keywords = cy_important_words.tail(total_cy_keywords).index.tolist()
election_keywords = election_important_words.tail(total_election_keywords).index.tolist()

# Union of the three lists. sorted() replaces list(): the iteration order of
# a set of strings differs from one Python process to the next (string hash
# randomisation), which made the feature column order change between kernel
# restarts. Sorting pins it.
keywords = sorted(set(soccer_keywords + cy_keywords + election_keywords))




def extract_features(text,binary=True,keyword_list=None):
    """Build a keyword feature vector for one piece of text.

    Parameters
    ----------
    text : str
        The text to scan. Anything that is not a ``str`` (for example the
        NaN pandas uses for a missing cell) is treated as empty text.
    binary : bool, default True
        If true, each feature is 1 when the keyword occurs in ``text`` and 0
        otherwise. If false, each feature is the number of non-overlapping
        occurrences of the keyword.
    keyword_list : iterable of str, optional
        Keywords to look for. Defaults to the module-level ``keywords``.

    Returns
    -------
    pandas.Series
        Integer series indexed by keyword, in the order the keywords were
        given.

    Notes
    -----
    Keywords are matched as literal substrings. They used to be handed to
    ``re.findall`` as patterns, so a keyword containing a regex
    metacharacter either raised ``re.error`` (e.g. ``'?'``) or matched text
    it should not (e.g. ``'c.d'`` matching ``'cxd'``).
    """
    if keyword_list is None:
        keyword_list = keywords
    if not isinstance(text, str):
        text = ''

    if binary:
        values = {kw: int(kw in text) for kw in keyword_list}
    else:
        values = {kw: text.count(kw) for kw in keyword_list}
    # Explicit dtype keeps the result an integer series even with no keywords.
    return pd.Series(values, dtype='int64')

import ast

raw_train_csv = pd.read_csv('offsite-tagging-training-set (1).csv')

# The provided test-set CSV ('offsite-tagging-test-set (1).csv') does not have a
# "tags" column. Hence, a simple scraper obtained the tags and stored them in
# raw_validation_csv.csv; the scraper implementation is included below.
# Only the scraped file is loaded here (the untagged test set used to be read
# first and then immediately overwritten).
raw_validation_csv = pd.read_csv('raw_validation_csv.csv')

# Each "tags" cell holds the text form of a Python list (e.g. "['足球']"); keep
# its first entry. ast.literal_eval only accepts literals, so text in the
# scraped file cannot execute code the way eval() would allow.
# An empty list ("[]") raises IndexError here.
raw_validation_csv['tags'] = raw_validation_csv['tags'].apply(lambda tags: ast.literal_eval(tags)[0])

##### text to features
def _binary_features(text):
    """Keyword-presence indicators (0/1) for a single article."""
    return extract_features(text, binary=True)

print('feature extraction starts')
training_X = raw_train_csv['text'].apply(_binary_features).fillna(0)
validation_X = raw_validation_csv['text'].apply(_binary_features).fillna(0)
print('feature extraction ends')

# Labels for the training rows, and the distinct labels in order of first
# appearance.
training_Y = raw_train_csv['tags']
all_possible_outputs = training_Y.unique().tolist()

##### training
# fit() returns the estimator itself, so construction and fitting are chained.
clf = LogisticRegression(random_state=0).fit(training_X, training_Y)

####decision tree
# from sklearn.tree import DecisionTreeClassifier, export_graphviz
# from sklearn import tree
# from sklearn.datasets import load_wine
# from IPython.display import SVG
# from graphviz import Source
# import pydotplus
# from IPython.display import display

# dt_clf = DecisionTreeClassifier(random_state=0,min_samples_leaf=0.01)
# dt_clf.fit(training_X,training_Y)


# graph = Source(tree.export_graphviz(dt_clf, out_file=None,\
#                                     feature_names=training_X.columns.tolist(),\
#                                     class_names=dt_clf.classes_,\
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))


#### validate on training set
predicted_Y = clf.predict(training_X)
t_stats_df, t_cm = stats_utils.generate_statistic(
    predicted_Y, training_Y, all_possible_outputs=all_possible_outputs)
for report in (t_stats_df, t_cm):
    print(report)

#### validate on validation set
validation_Y = raw_validation_csv['tags']
predicted_Y = clf.predict(validation_X)
predicted_labels = predicted_Y.tolist()
# Store the predictions next to the scraped tags for later inspection.
raw_validation_csv['predicted_tags'] = predicted_labels
v_stats_df, v_cm = stats_utils.generate_statistic(
    predicted_Y, validation_Y, all_possible_outputs=all_possible_outputs)

# Blank line, the raw prediction list, blank line.
print('', predicted_labels, '', sep='\n')

for report in (v_stats_df, v_cm):
    print(report)


feature extraction starts
feature extraction ends
                                                              0
output_label=梁振英|cases_satisfying_antecedent_count   929.000000
output_label=梁振英|confidence                            1.000000
output_label=梁振英|correctly_classified_count          929.000000
output_label=梁振英|incorrectly_classified_count          0.000000
output_label=梁振英|lift                                  4.191604
output_label=梁振英|support                               0.238572
output_label=美國大選|cases_satisfying_antecedent_c...   842.000000
output_label=美國大選|confidence                           1.000000
output_label=美國大選|correctly_classified_count         842.000000
output_label=美國大選|incorrectly_classified_count         0.000000
output_label=美國大選|lift                                 4.624703
output_label=美國大選|support                              0.216230
output_label=足球|cases_satisfying_antecedent_count   2123.000000
output_label=足球|confidence                            

In [3]:
###### Question 3b: scraper for the validation set
import requests
import time
from random import randint
import pandas as pd

# Article endpoint; '{}' is filled with the article id.
url_template = 'https://web-data.api.hk01.com/v2/articles/{}'

# Header set sent with every request (browser-style user agent).
headers = {'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

raw_train_csv = pd.read_csv('offsite-tagging-training-set (1).csv')
raw_validation_csv = pd.read_csv('offsite-tagging-test-set (1).csv')

# Distinct tags of the training set, in order of first appearance; scraped
# tags outside this list are discarded by get_request.
all_possible_outputs = raw_train_csv['tags'].drop_duplicates().tolist()
def get_request(id):
    """Fetch the tags of one article and keep those seen in the training set.

    Parameters
    ----------
    id
        Article id; substituted into ``url_template``.

    Returns
    -------
    str
        ``str()`` of the list of ``tagName`` values that occur in
        ``all_possible_outputs``. If the request or the parsing of its
        response fails, a message is printed and ``'[]'`` is returned.
    """
    url = url_template.format(id)
    target_tags = []
    try:
        # A timeout stops one stalled connection from hanging the whole run.
        request_json = requests.get(url, headers=headers, timeout=10).json()
        tags = request_json['tags']
        target_tags = [tag['tagName'] for tag in tags if tag['tagName'] in all_possible_outputs]
        print(target_tags)
    except Exception as exc:
        # Best effort: report and continue with the next article. Catching
        # Exception (not a bare except) lets Ctrl-C / kernel interrupt through.
        print('{} fail'.format(id))
        print('  reason: {!r}'.format(exc))
    # Random 0-3 s pause between requests, applied after failures as well so
    # a failing endpoint is not hit in a tight loop.
    time.sleep(randint(0,3))
    return str(target_tags)

print('running')
# For a quick trial on a single article, call get_request on
# raw_validation_csv['id'].head(1) first.

# One request per article id, in row order; each result is the string form
# of that article's tag list.
scraped_tags = [get_request(article_id) for article_id in raw_validation_csv['id']]
raw_validation_csv['tags'] = scraped_tags

# Persist the tagged validation set for the modelling cell to read.
raw_validation_csv.to_csv('raw_validation_csv.csv')
print('finish scrapping')





running
['足球']
['梁振英']
['足球']
['足球']
['梁振英']
['梁振英']
['梁振英']
['足球']
['足球']
['足球']
['足球']
['足球']
['足球']
['足球']
['足球']
['足球']
['足球']
['梁振英']
['梁振英']
['梁振英']
['美國大選']
['梁振英']
['梁振英']
['梁振英']
['梁振英']
['梁振英']
['足球']
['梁振英']
['足球']
['梁振英']
['足球']
['梁振英']
['足球']
['足球']
['足球']
['足球']
['梁振英']
['美國大選']
['足球']
['美國大選']
['足球']
['梁振英']
['足球']
['足球']
['足球']
['足球']
['足球']
['足球']
['足球']
['足球']
['足球']
['美國大選']
['足球']
['足球']
['美國大選']
['足球']
['足球']
['足球']
['足球']
['足球']
['足球']
['美國大選']
['美國大選']
['美國大選']
['美國大選']
['足球']
['足球']
['美國大選']
['足球']
['足球']
['足球']
['足球']
['足球']
['梁振英']
['足球']
['足球']
['足球']
['足球']
['足球']
['足球']
['足球']
['美國大選']
['梁振英']
['美國大選']
['美國大選']
['美國大選']
['足球']
['梁振英']
['足球']
['足球']
['足球']
['美國大選']
['梁振英']
['足球']
['足球']
['足球']
['足球']
['足球']
['足球']
['足球']
['美國大選']
['足球']
['足球']
['足球']
['足球']
['梁振英']
['足球']
['足球']
['足球']
['梁振英']
['美國大選']
['梁振英']
['足球']
['足球']
['足球']
['梁振英']
['梁振英']
['美國大選']
['美國大選']
['美國大選']
['美國大選']
['足球']
['足球']
['美國大選']
['梁振英']
['美國大選']
['梁振英']
['美國大選']
['足球']
['足球']
['足球']