# Imports

In [1]:
import glob
import copy
import nltk
import scipy
import multiprocessing
import operator, os, pickle
import imp
import re

import numpy as np
import pandas as pd
import utils as my_utils

from nltk.corpus import stopwords
from bs4 import BeautifulSoup as bs
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

from tqdm import tqdm
from collections import Counter
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split

# Necessary Functions

In [2]:
stop_words = stopwords.words('english')
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.PorterStemmer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
       
def preprocess(pd):
    pd = pd.str.lower()
    pd = pd.str.replace('[^a-zA-Z]', ' ')
    pd = pd.apply(lambda x: [w for w in w_tokenizer.tokenize(str(x))])
    pd = pd.str.join(' ')
    
    pd = pd.apply(lambda x: [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(x)])    
    pd = pd.apply(lambda x: [item for item in x if len(item)>1])
    return pd

def process_df_body(df):
    df['body_text'] = preprocess(df['body']).apply(lambda x: " ".join(x))
    return df

def process_df_title(df):
    df['title_text'] = preprocess(df['title']).apply(lambda x: " ".join(x))
    return df

In [3]:
path = "../nontracked/"
dataset_name = "dataset_cleaned_document_classification"
csv_name = "items_old.csv"

# Data Loading

In [4]:
if os.path.isfile(path + dataset_name):
    print("\nFound Cache File, Loading...")
    dataset = pd.read_pickle(path + dataset_name)
else:
    print("\nCache Not Found, Generating Cache...")
    
    dataset = pd.read_csv(path + csv_name, header=None)
    dataset = dataset.dropna()
    dataset = dataset.rename(columns={1:'title', 2:'body'})

    dataset['body'] = dataset['body'].astype(str)
    dataset['title'] = dataset['title'].astype(str)

    n_cores = 45

    n = int(dataset.shape[0]/n_cores)

    pool = multiprocessing.Pool(n_cores)
    list_df = [dataset[i:i+n] for i in range(0, dataset.shape[0], n)]

    print("Processing title...")
    processed_list_df = pool.map(process_df_title, list_df)
    pool.close()
    dataset = pd.concat(processed_list_df)

    print("Processing body...")
    pool = multiprocessing.Pool(n_cores)
    list_df = [dataset[i:i+n] for i in range(0, dataset.shape[0],n)]

    processed_list_df = pool.map(process_df_body, list_df)
    pool.close()
    dataset = pd.concat(processed_list_df)

    dataset['text'] = dataset[['title_text', 'body_text']].agg(' '.join, axis=1).astype(str)

    dataset.to_pickle(path + dataset_name)
    print("Cache dumped...")


Found Cache File, Loading...


In [5]:
advt = dataset[['title', 'body']].agg(' '.join, axis=1).str.lower()

In [6]:
a_1 = advt.apply(lambda x: True if (('amazon deal' in x) or ('$' in x and 'amazon' in x) or ('price' in x and 'drop' in x) or ('deals' in x) or ('limited' in x and 'time' in x)) else False)

In [7]:
advt_ = dataset[a_1]

In [8]:
advt_['y'] = 'advt'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [9]:
# stackexchange.com
# stackoverflow.com
# superuser.com
# serverfault.com
# community.byte.co
# discuss.linuxcontainers.org
# forum.lazarus.freepascal.org
# forums.lutris.net
# forum.asrock.com
# answers.yahoo.com
# answers.com
# techcommunity.microsoft.com
# uberpeople.net
# quora.com
# forums.macrumors.com
# community.shopify.com
# forums.wyzecam.com
# community.spotify.com
# forums.whonix.org
# forum.opnsense.org
# forum.odroid.com

In [10]:
forum = dataset[dataset[3].apply(lambda x: True if ('stackexchange.com' in x or 'stackoverflow.com' in x or 'superuser.com' in x or 'serverfault.com' in x or 'community.byte.co' in x or 'discuss.linuxcontainers.org' in x or 'forum.lazarus.freepascal.org' in x or 'forums.lutris.net' in x or 'forum.asrock.com' in x or 'answers.yahoo.com' in x or 'answers.com' in x or 'techcommunity.microsoft.com' in x or 'uberpeople.net' in x or 'quora.com' in x or 'forums.macrumors.com' in x or 'community.shopify.com' in x or 'forums.wyzecam.com' in x or 'community.spotify.com' in x or 'forums.whonix.org' in x or 'forum.opnsense.org' in x or 'forum.odroid.com' in x) else False)]
forum['y'] = 'forum'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [11]:
forum.shape

(107901, 10)

In [12]:
# arstechnica.com
# blogs.nvidia.com
# variety.com
# businessinsider.com
# spacedaily.com
# theguardian.com
# interestingengineering.com
# nysenasdaqlive.com
# seekingalpha.com
# finance.yahoo.com
# aithority.com
# techcrunch.com
# bloombergquint.com
# theverge.com
# aljazeera.com
# cnbc.com
# vox.com
# nme.com
# nytimes.com
# seattletimes.com
# telegraph.co.uk
# ben-evans.com
# thedrum.com
# ft.com
# cnn.com
# bloomberg.com
# reuters.com
# ibtimes.com
# apnews.com
# ibtimes.com
# usatoday.com
# hackernoon.com
# thenextweb.com
# venturebeat.com
# informationweek.com

In [13]:
good = dataset[dataset[3].apply(lambda x: True if ('arstechnica.com' in x or 'blogs.nvidia.com' in x or 'variety.com' in x or 'businessinsider.com' in x or 'spacedaily.com' in x or 'theguardian.com' in x or 'interestingengineering.com' in x or 'nysenasdaqlive.com' in x or 'seekingalpha.com' in x or 'finance.yahoo.com' in x or 'aithority.com' in x or 'techcrunch.com' in x or 'bloombergquint.com' in x or 'theverge.com' in x or 'aljazeera.com' in x or 'cnbc.com' in x or 'vox.com' in x or 'nme.com' in x or 'nytimes.com' in x or 'seattletimes.com' in x or 'telegraph.co.uk' in x or 'theverge.com' in x or 'thedrum.com' in x or 'ft.com' in x or 'cnn.com' in x or 'bloomberg.com' in x or 'reuters.com' in x or 'ibtimes.com' in x or 'apnews.com' in x or 'venturebeat.com' in x or 'usatoday.com' in x or 'hackernoon.com' in x or 'thenextweb.com' in x or 'informationweek.com' in x) else False)]
good['y'] = 'good'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
advt_.shape, forum.shape, good.shape

((129944, 10), (107901, 10), (73905, 10))

In [15]:
subset = pd.concat([advt_, forum, good])

In [16]:
g = subset.groupby('y')
subset = pd.DataFrame(g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True))).reset_index(drop=True)

In [17]:
Counter(subset.y)

Counter({'advt': 73905, 'forum': 73905, 'good': 73905})

In [18]:
# POS
# Numbers
# Alphabets
# Special Characters
# Count of uppercase
# Count of lowercase
# Number of words
# total character count

In [19]:
subset_tr, subset_te = train_test_split(subset, test_size=0.3, random_state=42)

In [20]:
subset_tr['raw_text'] = subset_tr[['title', 'body']].agg(' '.join, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [21]:
subset_te['raw_text'] = subset_te[['title', 'body']].agg(' '.join, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [22]:
ner = set(pd.read_csv("ner_list", header=None, sep="\\").values.reshape(-1))
ner = {i:idx for idx, i in enumerate(ner)}

In [23]:
def get_pos_tags(text):
    pos = nltk.pos_tag(word_tokenize(text))

    m = {i:0 for i in [j[1] for j in pos]}
    m.keys
    for _, j in pos:
        m[j] += 1
    n = [0] * len(ner)
    for k, v in m.items():
        n[ner[k]] = v
    return n

In [24]:
def get_special_count(inp):
    text, raw_text = inp
    v_raw_tl = len(raw_text)
    v_proc_tl = len(text)
    v_raw_wc = len(word_tokenize(raw_text))
    v_proc_wc = len(word_tokenize(text))
    v_num = len(re.findall(r'([\d])', raw_text))
    v_uppercase = len(re.findall(r'([A-Z])', raw_text))
    v_lower = len(re.findall(r'([a-z])', raw_text))
    v_spl_chrs = len(re.findall(r'([^(a-zA-Z\d \n)])', raw_text))
    v_new_line = len(re.findall(r'([\n])', raw_text))
    return [v_raw_tl, v_proc_tl, v_raw_wc, v_proc_wc, v_num, v_uppercase, v_lower, v_spl_chrs, v_new_line]

In [25]:
n_cores = 45

In [26]:
%%time
pool = multiprocessing.Pool(n_cores)
ner_list = pool.map(get_pos_tags, subset_tr['raw_text'].tolist())
pool.close()

CPU times: user 2.21 s, sys: 18.4 s, total: 20.6 s
Wall time: 3min 15s


In [27]:
%%time
pool = multiprocessing.Pool(n_cores)
te_ner_list = pool.map(get_pos_tags, subset_te['raw_text'].tolist())
pool.close()

CPU times: user 1.06 s, sys: 31.5 s, total: 32.5 s
Wall time: 1min 47s


In [28]:
pool = multiprocessing.Pool(n_cores)
special_counts = pool.map(get_special_count, subset_tr[['text', 'raw_text']].values.tolist())
pool.close()

In [29]:
pool = multiprocessing.Pool(n_cores)
te_special_counts = pool.map(get_special_count, subset_te[['text', 'raw_text']].values.tolist())
pool.close()

In [30]:
min_df = 20
max_df = .6
max_features = 50000

In [31]:
vectorizer = TfidfVectorizer(analyzer='word',
                             stop_words="english", max_features=max_features,
                             max_df=max_df, min_df=min_df)

In [32]:
train_vec = vectorizer.fit_transform(subset_tr['text'])

In [33]:
test_vec = vectorizer.transform(subset_te['text'])

In [34]:
len(vectorizer.get_feature_names())

40387

In [35]:
X_train = np.concatenate([ner_list, special_counts, train_vec.toarray()], axis=1)

In [36]:
X_test = np.concatenate([te_ner_list, te_special_counts, test_vec.toarray()], axis=1)

In [57]:
clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)

In [58]:
clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [59]:
clf.fit(X_train, subset_tr['y'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [39]:
def preprocessing_new(text):
    text = [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(re.sub(r'[^a-zA-Z]', " ", text.lower()))]
    return " ".join([i for i in text if len(i)>1])

In [41]:
raw_text = subset_tr.iloc[0]['raw_text']

In [42]:
text = preprocessing_new(raw_text)

In [43]:
te_raw_pos = get_pos_tags(raw_text)

In [44]:
te_spl_count = get_special_count([text, raw_text])

In [45]:
te_vec = vectorizer.transform([text]).toarray()[0].tolist()

In [46]:
X_te = te_raw_pos + te_spl_count + te_vec

In [47]:
ret = {}
for i, j in zip(clf.classes_, clf.predict_proba([X_te])[0]):
    ret[i] = j

In [60]:
clf.score(X_test, subset_te['y'])

0.8630835149966173

In [61]:
import joblib

In [62]:
joblib.dump(vectorizer, "vectorizer")

['vectorizer']

In [64]:
joblib.dump(clf, "clf")

['clf']

In [None]:
# def get_pos_name(j):
#     return [i[1] for i in nltk.pos_tag(word_tokenize(j))]

In [None]:
# %%time
# pool = multiprocessing.Pool(n_cores)
# ner_list = pool.map(get_pos_name, subset_tr['raw_text'].tolist())
# pool.close()

In [None]:
# ner = set()
# for i in tqdm(raw_text_results):
#     ner |= set(i.keys())
# pd.DataFrame([ner]).T.to_csv("ner_list", index=None, header=None)

In [None]:

#     filenames = glob.glob(path + "items_old.csv")

#     dataset = []
#     for filename in filenames:
#         dataset.append(pd.read_csv(filename, header=None))
#     dataset = pd.concat(dataset)
    