In [2]:
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn import svm
import re
from nltk.corpus import stopwords
from nltk import FreqDist
import random



In [3]:
def clear_stopwords(context):
    """Tokenise ``context`` and drop English stopwords.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, and tokens found in NLTK's
    English stopword list are removed.

    Parameters
    ----------
    context : str
        Raw text to tokenise.

    Returns
    -------
    list of str
        The remaining lower-case tokens, in their original order
        (duplicates are kept).
    """
    # The stopword list is identical for every call, but this function is
    # invoked once per document.  Build the set a single time and keep it on
    # the function object instead of re-reading the corpus on every call.
    stopword = getattr(clear_stopwords, "_stopword_cache", None)
    if stopword is None:
        stopword = set(stopwords.words('english'))
        clear_stopwords._stopword_cache = stopword

    tokens = re.sub("[^a-zA-Z]", " ", context).lower().split()
    return [token for token in tokens if token not in stopword]

def clear_html(raw_html):
    """Return ``raw_html`` with every ``<...>`` span removed.

    The match is non-greedy and ``.`` does not match newlines, so a tag that
    spans several lines is left in place.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)

def get_word_list(data):
    """Build word-frequency distributions for one data set.

    Parameters
    ----------
    data : mapping
        Must provide the columns ``'content'``, ``'title'`` and ``'tags'``,
        each an iterable of strings (e.g. a DataFrame read from CSV).

    Returns
    -------
    dict
        ``{"fdist_title": FreqDist, "fdist_tags": FreqDist,
        "fdist_content": FreqDist}`` where

        * content words have HTML tags stripped and stopwords removed,
        * title words have stopwords removed,
        * tags are split on whitespace and otherwise left untouched.

    Side effects
    ------------
    Prints the number of entries in ``data['content']``.
    """
    # Parenthesised so the line also parses on Python 3; on Python 2 a single
    # parenthesised value prints exactly as the old print statement did.
    print(len(data['content']))

    content_word_list = []
    for content in data['content']:
        content_word_list.extend(clear_stopwords(clear_html(content)))

    title_word_list = []
    for title in data['title']:
        title_word_list.extend(clear_stopwords(title))

    tag_word_list = []
    for tag in data['tags']:
        tag_word_list.extend(tag.split())

    fdist_title = FreqDist(title_word_list)
    fdist_content = FreqDist(content_word_list)
    fdist_tags = FreqDist(tag_word_list)
    return {"fdist_title": fdist_title, "fdist_tags": fdist_tags, "fdist_content": fdist_content}


In [4]:
# Read every per-topic CSV first, then build the frequency distributions in
# the same order (biology, diy, travel, crypto, robotics) so the row counts
# printed by get_word_list appear in that order.
biology, diy, travel, crypto, robotics = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./input/biology.csv",
        "./input/diy.csv",
        "./input/travel.csv",
        "./input/crypto.csv",
        "./input/robotics.csv",
    )
)
fdist_biology, fdist_diy, fdist_travel, fdist_crypto, fdist_robotics = (
    get_word_list(frame)
    for frame in (biology, diy, travel, crypto, robotics)
)
#fdist['fdist_tags'].plot(50,cumulative=True,title='tavel tag freq')

13196
25918
19279
10432
2771


In [5]:
def changeDist(fdist):
    """Copy the three frequency distributions into plain dictionaries.

    Parameters
    ----------
    fdist : mapping
        Must contain ``'fdist_tags'``, ``'fdist_content'`` and
        ``'fdist_title'``; each value only needs an ``items()`` method
        yielding ``(word, count)`` pairs.

    Returns
    -------
    dict
        ``{'keywords': {...}, 'content': {...}, 'title': {...}}`` holding
        new ``dict`` objects built from tags, content and title respectively.
    """
    return {
        'keywords': {word: count for word, count in fdist['fdist_tags'].items()},
        'content': {word: count for word, count in fdist['fdist_content'].items()},
        'title': {word: count for word, count in fdist['fdist_title'].items()},
    }

In [6]:
# Replace each raw get_word_list() result by its plain-dict form.
# NOTE: the names are rebound in place, so this cell is not re-runnable on
# its own -- changeDist looks up 'fdist_tags' / 'fdist_content' /
# 'fdist_title', which the converted dictionaries no longer contain.
fdist_travel, fdist_crypto, fdist_robotics, fdist_biology, fdist_diy = (
    changeDist(raw_fdist)
    for raw_fdist in (
        fdist_travel,
        fdist_crypto,
        fdist_robotics,
        fdist_biology,
        fdist_diy,
    )
)

In [7]:
def getAlldist(Fdist):
    """Sum the per-domain word counts into one distribution per field.

    Parameters
    ----------
    Fdist : mapping
        ``{domain: {'keywords': {word: n}, 'content': {...}, 'title': {...}}}``.

    Returns
    -------
    dict
        ``{'keywords': {...}, 'content': {...}, 'title': {...}}`` where each
        word maps to the sum of its counts over all domains.
    """
    merged = {'keywords': {}, 'content': {}, 'title': {}}
    for dtype in Fdist:
        per_domain = Fdist[dtype]
        for field in ('keywords', 'content', 'title'):
            running = merged[field]
            counts = per_domain[field]
            for word in counts:
                running[word] = running.get(word, 0) + counts[word]
    return merged

In [8]:
def getStat(Fdist):
    """Total the word counts per domain and over all domains.

    Parameters
    ----------
    Fdist : mapping
        ``{domain: {field: {word: count}}}``.

    Returns
    -------
    dict
        ``{'all': {field: total}, domain: {field: total}, ...}``.  The
        ``'all'`` entry accumulates every domain's counts for each field.
    """
    stat = {"all": {}}
    for key in Fdist:
        stat[key] = {}
        for ctype, counts in Fdist[key].items():
            stat['all'].setdefault(ctype, 0)
            stat[key][ctype] = 0
            for word in counts:
                occurrences = counts[word]
                stat['all'][ctype] += occurrences
                stat[key][ctype] += occurrences
    return stat

In [9]:
def _rate_per_thousand(count, total):
    """Return ``count`` per 1000 of ``total``, or 0.0 when ``total`` is zero."""
    if not total:
        return 0.0
    return 1000 * float(count) / total


def _count_word(word, fdist, allDist):
    """Collect one word's counts inside a domain and in all other domains."""
    counts = {'if_keyword': 1 if word in fdist['keywords'] else 0}
    for field in ('content', 'title', 'keywords'):
        own = fdist[field][word] if word in fdist[field] else 0
        counts[field] = own
        # "other" = occurrences in every domain except the current one.
        if word in allDist[field]:
            counts[field + '_other'] = allDist[field][word] - own
        else:
            counts[field + '_other'] = 0
    return counts


def findkeywordsInText(allDist, Fdist, stat):
    """Build one feature row per (domain, word) pair.

    For every domain in ``Fdist`` each word that occurs in that domain's
    content or title gets a row.  Words that occur only among the keywords
    are not included.

    Parameters
    ----------
    allDist : dict
        ``{'keywords'|'content'|'title': {word: count}}`` summed over all
        domains (see ``getAlldist``).
    Fdist : dict
        ``{domain: {'keywords'|'content'|'title': {word: count}}}``.
    stat : dict
        ``{'all': {field: total}, domain: {field: total}}`` (see ``getStat``).

    Returns
    -------
    pandas.DataFrame
        Columns:

        * ``dtype``, ``word`` -- the domain key and the word,
        * ``if_keyword`` -- 1 if the word is one of the domain's keywords,
        * ``content`` / ``title`` / ``keywords`` -- count inside the domain,
        * ``*_other`` -- count in all other domains,
        * ``*_rate`` / ``*_other_rate`` -- the count per 1000 words of the
          corresponding total.  A rate whose total is zero (an empty field,
          or a single-domain input for the ``*_other`` totals) is 0.0 rather
          than raising ``ZeroDivisionError``.
    """
    fields = ('keywords', 'content', 'title')
    column_order = (
        ['dtype', 'word', 'if_keyword']
        + [name for field in fields for name in (field, field + '_other')]
        + [name for field in fields for name in (field + '_rate', field + '_other_rate')]
    )
    columns = {name: [] for name in column_order}

    # Looked up lazily inside the loop so that an empty ``Fdist`` (for which
    # getStat returns an empty 'all' entry) yields an empty frame.
    overall = stat['all']

    for key in Fdist:
        fdist = Fdist[key]
        totals = stat[key]

        # Content words first, then words that appear only in titles.
        words = {}
        for source in ('content', 'title'):
            for w in fdist[source]:
                if w not in words:
                    words[w] = _count_word(w, fdist, allDist)

        for w in words:
            counts = words[w]
            columns['word'].append(w)
            columns['dtype'].append(key)
            columns['if_keyword'].append(counts['if_keyword'])
            for field in fields:
                other = field + '_other'
                columns[field].append(counts[field])
                columns[other].append(counts[other])
                columns[field + '_rate'].append(
                    _rate_per_thousand(counts[field], totals[field]))
                columns[other + '_rate'].append(
                    _rate_per_thousand(counts[other], overall[field] - totals[field]))

    return pd.DataFrame(columns)


In [10]:
# Gather the per-domain distributions under their domain name, derive the
# cross-domain sums and totals, then build the per-word feature table.
fdist = {
    'travel': fdist_travel,
    'crypto': fdist_crypto,
    'robotics': fdist_robotics,
    'biology': fdist_biology,
    'diy': fdist_diy,
}
allDist = getAlldist(fdist)
stat = getStat(fdist)
df = findkeywordsInText(allDist, fdist, stat)
df.head(50)

Unnamed: 0,content,content_other,content_other_rate,content_rate,dtype,if_keyword,keywords,keywords_other,keywords_other_rate,keywords_rate,title,title_other,title_other_rate,title_rate,word
0,1,0,0.0,0.001545,biology,0,0,0,0.0,0.0,0,0,0.0,0.0,ofswarup
1,2,0,0.0,0.003089,biology,0,0,0,0.0,0.0,0,0,0.0,0.0,telmisartan
2,3,44,0.012802,0.004634,biology,0,0,0,0.0,0.0,0,1,0.002925,0.0,circuitry
3,3,0,0.0,0.004634,biology,0,0,0,0.0,0.0,0,0,0.0,0.0,clotted
4,38,16,0.004655,0.058693,biology,0,0,0,0.0,0.0,11,4,0.011699,0.159851,spiders
5,16,366,0.106491,0.024713,biology,0,0,86,0.549637,0.0,1,64,0.187178,0.014532,hanging
6,11,2,0.000582,0.01699,biology,0,0,0,0.0,0.0,3,0,0.0,0.043596,woody
7,3,3,0.000873,0.004634,biology,0,0,0,0.0,0.0,1,0,0.0,0.014532,trawling
8,10,22,0.006401,0.015446,biology,0,0,0,0.0,0.0,0,1,0.002925,0.0,localized
9,1,0,0.0,0.001545,biology,0,0,0,0.0,0.0,0,0,0.0,0.0,yanagisawa


In [11]:
def randomSelect(df, seed=None):
    """Down-sample the non-keyword rows to roughly the number of keyword rows.

    Each row with ``if_keyword == 0`` is kept independently with probability
    ``(#keyword rows) / (#non-keyword rows)``; all rows with
    ``if_keyword == 1`` are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``if_keyword`` column holding 0 / 1.
    seed : hashable, optional
        When given, a private ``random.Random(seed)`` generator is used so the
        selection is reproducible.  When ``None`` (the default) the global
        ``random`` module state is used, as before.

    Returns
    -------
    pandas.DataFrame
        The sampled non-keyword rows followed by every keyword row, with a
        fresh 0..n-1 index.
    """
    keywords = df[df['if_keyword'] == 1]
    other = df[df['if_keyword'] == 0]
    kl = keywords.shape[0]
    ol = other.shape[0]

    rng = random if seed is None else random.Random(seed)
    keep_probability = float(kl) / ol if ol else 0.0
    # dtype=bool keeps the mask usable when there are no non-keyword rows
    # (an empty list would otherwise become a float array).
    selectFlag = np.array(
        [rng.random() <= keep_probability for _ in range(ol)], dtype=bool)
    positions = np.flatnonzero(selectFlag)

    # pd.concat replaces DataFrame.append, which newer pandas no longer has.
    return pd.concat([other.iloc[positions, :], keywords], ignore_index=True)
    

In [12]:
# Label column first, followed by the rate features used for modelling.
model_columns = [
    'if_keyword',
    'title_rate',
    'title_other_rate',
    'content_rate',
    'content_other_rate',
    'keywords_other_rate',
]
# The biology rows are held out for testing; the remaining domains are
# down-sampled by randomSelect to form the training set.
test_df = df[df['dtype'] == 'biology'][model_columns]
train_df = randomSelect(df[df['dtype'] != 'biology'])[model_columns]
train_df

Unnamed: 0,if_keyword,title_rate,title_other_rate,content_rate,content_other_rate,keywords_other_rate
0,0,0.000000,0.030843,0.027434,0.037188,0.000000
1,0,0.000000,0.000000,0.002195,0.000945,0.000000
2,0,0.000000,0.000000,0.001097,0.000000,0.000000
3,0,0.008408,0.000000,0.001097,0.000315,0.000000
4,0,0.000000,0.000000,0.001097,0.000000,0.000000
5,0,0.000000,0.000000,0.001097,0.000000,0.000000
6,0,0.008408,0.041123,0.003292,0.041915,0.000000
7,0,0.000000,0.000000,0.001097,0.000000,0.000000
8,0,0.000000,0.000000,0.008779,0.000000,0.000000
9,0,0.000000,0.000000,0.001097,0.000000,0.000000


In [13]:
# Work on the underlying arrays from here on; show the first 200 training rows.
train_data, test_data = train_df.values, test_df.values
train_data[:200, :]

array([[ 0.        ,  0.        ,  0.03084251,  0.02743448,  0.03718785,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.00219476,  0.00094545,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.00109738,  0.        ,
         0.        ],
       ..., 
       [ 0.        ,  0.02522492,  0.00685389,  0.01316855,  0.00598787,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.00109738,  0.        ,
         0.        ],
       [ 0.        ,  0.01681661,  0.        ,  0.031824  ,  0.00189091,
         0.        ]])

In [14]:
# Fit a default SVC: column 0 of train_data is passed as the target,
# all remaining columns as the features.
clf = svm.SVC()
clf.fit(train_data[:, 1:], train_data[:, 0])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [15]:
# Predict from every column of test_data except column 0.
pre = clf.predict(test_data[:, 1:])

In [16]:
# Boolean flag per test row: True where the predicted label equals 1.
prelist = [pre[i]==1 for i in range(len(pre))]
# NOTE(review): a bare expression that is not the last statement of a cell is
# evaluated but not rendered, so this line displays nothing.
prelist
# NOTE(review): same here -- the rows flagged by the prediction are selected
# and then discarded; assign the result or move it to its own cell to see it.
test_df[np.array(prelist)]
# Only this final expression is rendered: the rows of test_df whose
# if_keyword value is 1.
test_df[test_df['if_keyword']==1]

Unnamed: 0,if_keyword,title_rate,title_other_rate,content_rate,content_other_rate,keywords_other_rate
70,1,8.050687,0.000000,4.049827,0.003201,0.000000
118,1,0.450490,0.002925,0.223961,0.000873,0.000000
834,1,0.159851,0.000000,0.080317,0.000000,0.000000
841,1,0.406894,0.093589,0.203881,0.040734,0.000000
842,1,0.000000,0.000000,0.003089,0.000000,0.000000
954,1,0.000000,0.000000,0.003089,0.000000,0.000000
980,1,0.000000,0.000000,0.001545,0.000000,0.000000
1159,1,0.043596,0.000000,0.027802,0.000000,0.000000
1223,1,0.145319,0.064342,0.115842,0.034042,0.000000
1224,1,0.276107,0.000000,0.143644,0.000000,0.000000
