# Text feature engineering: turning unstructured text into structured features

In [1]:
import re
import pandas as pd
import numpy as np
import pickle
import os

## Return to the main (parent) directory; the relative paths used further down
## (e.g. 'data/article_preprocessed.csv') are resolved against this directory.
## NOTE: "../" is relative to the current working directory, so every re-run of
## this cell climbs one more level. Restart the kernel before re-running it.
os.chdir("../")
os.getcwd()

'/Users/yoga/Documents/GitHub/aiaRNN/class_text_mining'

In [2]:
## Load the preprocessed articles; the path is relative to the current working directory.
df = pd.read_csv('data/article_preprocessed.csv')

In [3]:
## load 'article_cutted'
## NOTE: pickle.load can execute arbitrary code while deserializing; only load
## files that come from a trusted source.
## presumably `sentences` holds one sequence of tokens per article (each entry is
## joined with ' ' before vectorizing) — TODO confirm
with open("article_cutted", "rb") as file:
    sentences = pickle.load(file)

## define y (push > boo)

In [4]:
## drop data
## Keep only the rows whose push and boo counts differ (in absolute value) by
## more than the threshold; .copy() detaches the result from the original frame.
diff_threshold = 20
df = df.loc[(df['push'] - df['boo']).abs() > diff_threshold].copy()

In [5]:
## define y
## Clip the push-minus-boo difference into [0, 1]; for integer counts this gives
## 1 when push > boo and 0 otherwise.
df['type'] = (df['push'] - df['boo']).clip(0, 1)
## renumber the rows 0..n-1 and discard the old index
df = df.reset_index(drop=True)

In [6]:
## Number of rows per value of the target column 'type'.
df['type'].value_counts()

1    17318
0     1134
Name: type, dtype: int64

## simple feature

In [7]:
## word count
## http://blog.csdn.net/gatieme/article/details/43235791 (regular expressions for Chinese text)
## A "word" is either a run of ASCII letters/digits or a single character in the
## U+4E00..U+9FFF range; the two counts are added per row.
df['word_count'] = (
    df['content'].str.count('[a-zA-Z0-9]+')
    .add(df['content'].str.count('[\u4e00-\u9fff]'))
)

In [14]:
## punctuation count
## Create the "punctuation count" column: the number of characters that remain
## after deleting every word character (\w) and whitespace character (\s).
## regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a
## literal string by default, which would delete nothing and make this column
## equal to the raw text length. The raw string avoids the invalid-escape
## warning that '\w' / '\s' trigger in a normal string literal.
df['punctuation count'] = df['content'].str.replace(r'[\w\s]', '', regex=True).str.len()

In [19]:
## question mark count
## Create the "question mark count" column: occurrences of the ASCII "?" or the
## full-width "？" in each article's content.
df['question mark count'] = df['content'].str.count('[?？]')

In [None]:
## Feel free to come up with additional features of your own, e.g. ratios

In [10]:
## drop punctuation column
## NOTE(review): no visible cell creates a column named 'punctuation' (the
## feature built in this notebook is called 'punctuation count'), so on a fresh
## top-to-bottom run the column may not exist unless it comes from the CSV —
## confirm. errors='ignore' makes the drop a no-op in that case instead of
## raising KeyError, and keeps the cell safe to re-run.
df = df.drop(columns=['punctuation'], errors='ignore')

In [20]:
## Preview the first five rows of the last four columns.
df.iloc[:, -4:].head(5)

Unnamed: 0,type,word_count,punctuation count,question mark count
0,1,175,0,0
1,1,145,4,1
2,1,393,17,8
3,1,295,15,6
4,1,41,4,0


In [22]:
## compute correlation
## Compute the correlation between the features built above and type.
## The selection is positional: it takes whatever the last four columns are.
df.iloc[:, -4:].corr(method='pearson')

Unnamed: 0,type,word_count,punctuation count,question mark count
type,1.0,-0.0451,-0.024124,-0.056966
word_count,-0.0451,1.0,0.738419,0.5349
punctuation count,-0.024124,0.738419,1.0,0.34215
question mark count,-0.056966,0.5349,0.34215,1.0


## bag of words

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
## define transformer
vectorizer = CountVectorizer()
## Each entry of `sentences` is joined with single spaces into one document
## string, then the vectorizer learns the vocabulary and counts terms per document.
count = vectorizer.fit_transform(list(map(' '.join, sentences)))

In [25]:
## Display the document-term matrix summary (shape, dtype, stored elements).
count

<252229x372654 sparse matrix of type '<class 'numpy.int64'>'
	with 7700068 stored elements in Compressed Sparse Row format>

In [26]:
## save data as pickle format
## The fitted vectorizer and its count matrix are stored together as a
## two-element list in the file "article_count".
with open("article_count", "wb") as file:
    pickle.dump([vectorizer, count], file)

### select the top 10 most frequent words

In [None]:
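## Using the count matrix built above, extract the 10 keywords with the highest total count
## Build bag-of-words features from these top-10 keywords and compute their correlation with type
## NOTE: the count matrix has 252229 rows while the filtered df has 18452 rows, so the rows presumably need to be aligned before correlating — confirm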


# TF-IDF

In [109]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [110]:
## define transformer
## norm=None leaves the tf-idf row vectors un-normalized
vectorizer = TfidfVectorizer(norm=None)
## Each entry of `sentences` is joined with single spaces into one document
## string before fitting and transforming.
tfidf = vectorizer.fit_transform(list(map(' '.join, sentences)))

In [111]:
## save data as pickle format
## The fitted vectorizer and its tf-idf matrix are stored together as a
## two-element list in the file "article_tfidf".
with open("article_tfidf", "wb") as file:
    pickle.dump([vectorizer, tfidf], file)

### select the top 10 words by average tf-idf

In [None]:
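## Using the tfidf matrix built above, extract the 10 keywords with the highest average tf-idf
## Build tf-idf features from these top-10 keywords and compute their correlation with type
## NOTE: the tfidf matrix is built from all of `sentences` while df was filtered earlier, so the rows presumably need to be aligned before correlating — confirm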