In [1]:
import lightgbm as lgb
import json
import pandas as pd
import nltk
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.feature_extraction.text import TfidfVectorizer as TV
from sklearn.model_selection import train_test_split
import numpy as np
import pickle
# Seed NumPy's global RNG; the np.random.shuffle call in the file-level
# train/validation/test split draws from this generator.
np.random.seed(109)

In [2]:
# Monthly article dumps. Forward slashes are used so the relative paths resolve
# on both Windows and POSIX (backslash-separated paths are a single file name on
# POSIX), matching the style of the model paths used when pickling.
p1 = "../cybernews/data/jsons/Feb.json"
p2 = "../cybernews/data/jsons/March.json"

# Concatenate the records of every dump into one flat list, then flatten the
# (possibly nested) JSON records into a single DataFrame.
js_ls = []
for file in [p1, p2]:
    with open(file, encoding="utf-8") as fp:
        js = json.load(fp)
    js_ls.extend(js)
df = pd.json_normalize(js_ls)

In [3]:
# Invert the ranking so that the best-placed article (rank 1) receives the
# largest label: new = max(rank) - rank + 1.
# The untouched values are kept in "ranking_raw" and the inversion is always
# derived from them, so re-running this cell no longer flips the labels back.
if "ranking_raw" not in df.columns:
    df["ranking_raw"] = df["ranking"]
df["ranking"] = (df["ranking_raw"] - df["ranking_raw"].max()).abs() + 1

In [4]:
# Preview the first rows to sanity-check the parsed columns.
# NOTE(review): in the preview output, several titles do not appear to match
# the url/body on the same row (e.g. the "Report highlights cyber risks to US
# election s..." title sits next to a cyberscoop "florida-water-hack" url), and
# rows 0 and 3 share a title. Confirm the upstream extraction keeps
# title/ranking aligned with body, since body and ranking are what the model uses.
df.head()

Unnamed: 0,title,url,ranking,filename,body,date,source,author
0,White House announces senior official Is leadi...,https://www.nytimes.com/2021/02/10/us/politics...,7,Cyber_Clips_February_11_2021.docx,WASHINGTON — The White House announced on Wed...,2021-02-10 00:00:00,New York Times,"[Julian E. Barnes, David E. Sanger]"
1,Report highlights cyber risks to US election s...,https://www.cyberscoop.com/florida-water-hack-...,6,Cyber_Clips_February_11_2021.docx,A hack that apparently affected a Florida wat...,2021-02-10 00:00:00,CyberScoop,[Sean Lyngaas]
2,We Must Reorient US Cyber Strategy Around the ...,https://news.yahoo.com/report-highlights-cyber...,5,Cyber_Clips_February_11_2021.docx,ATLANTA (AP) — Election systems in the U.S. a...,2021-02-10 00:00:00,Associated Press,[Christina A. Cassidy]
3,White House announces senior official Is leadi...,https://www.defenseone.com/ideas/2021/02/five-...,4,Cyber_Clips_February_11_2021.docx,This oped is adapted from Dmitri Alperovitch'...,2021-02-10 00:00:00,Defense One,[Dmitri Alperovitch]
4,Florida hack highlights security shortages in ...,https://www.brookings.edu/blog/techtank/2021/0...,3,Cyber_Clips_February_11_2021.docx,America’s networks are under attack. While th...,2021-02-11 00:00:00,Brookings,[Tom Wheeler]


In [5]:
# Split at the file level (not the row level) so that all articles from one
# source document land in the same partition: 80% train, 10% validation,
# 10% test of the shuffled distinct file names.
files = df["filename"].unique()
np.random.shuffle(files)
_train_end = int(.8 * len(files))
_val_end = int(.9 * len(files))
train, val, test = np.split(files, [_train_end, _val_end])


In [6]:
def match(row):
    """Return the partition label for a row based on its ``filename``.

    The row's ``filename`` attribute is looked up, in order, in the
    module-level ``train``, ``val`` and ``test`` collections.

    Returns:
        "train", "validation" or "test" for the first collection containing
        the file name, or None when it is in none of them.
    """
    partitions = (
        ("train", train),
        ("validation", val),
        ("test", test),
    )
    for label, members in partitions:
        if row.filename in members:
            return label
    return None
        

In [7]:
# Label every row with the partition ("train" / "validation" / "test") of its file.
df["ttv"] = df.apply(match, axis=1)

In [8]:
# One frame per partition, ordered by filename so that the rows belonging to
# the same file are contiguous (the per-file group sizes are built in that order).
X_train, X_val, X_test = (
    df[df["ttv"] == split_name].sort_values(by="filename")
    for split_name in ("train", "validation", "test")
)

# The inverted ranking column is the target for each partition.
y_train, y_val, y_test = (
    X_train["ranking"],
    X_val["ranking"],
    X_test["ranking"],
)

In [9]:
# Number of rows per file, in sorted-filename order (groupby sorts its keys by
# default, which matches the sort_values(by="filename") ordering of the frames).
# size() counts rows; the previous count()["title"] counted only non-null titles,
# so a single missing title would make the group sizes disagree with the number
# of rows passed to the ranker.
train_group = X_train.groupby("filename").size().tolist()
val_group = X_val.groupby("filename").size().tolist()

In [11]:
stops = stopwords.words('english')
analyze = CV(stop_words=stops, min_df = 15, max_df=.5)
ps = PorterStemmer()

# A vectorizer instance is not callable; build_analyzer() returns the callable
# that applies its preprocessing, tokenisation and stop-word filtering.
# It is bound here, once, so stop_removal keeps working even if the name
# `analyze` is later rebound to a different vectorizer.
_tokenize = analyze.build_analyzer()
_stop_set = set(stops)  # set membership instead of scanning the list per token


def stop_removal(text : str):
    """Tokenise ``text``, stem every token and drop stemmed stop words.

    Args:
        text: Raw document text.

    Returns:
        The surviving Porter stems joined by single spaces.
    """
    # Stem each token once (the stem is needed both for the filter and the output).
    stems = (ps.stem(word) for word in _tokenize(text))
    return ' '.join(stem for stem in stems if stem not in _stop_set)

In [12]:
# TF-IDF features over the article bodies. The vocabulary and IDF weights are
# fitted on the training bodies only; the validation bodies are merely
# transformed with the fitted vectorizer. Note that this rebinds `analyze`,
# and this fitted TF-IDF vectorizer is the object pickled at the end.
analyze = TV(min_df=15, max_df=.5, stop_words=stops)
sparse_train = analyze.fit_transform(X_train["body"])
sparse_train = sparse_train.astype(np.float32)
sparse_val = analyze.transform(X_val["body"])
sparse_val = sparse_val.astype(np.float32)

In [13]:
# Train the ranker on the TF-IDF features, grouping rows by source file, and
# monitor NDCG on the validation files.
# Early stopping is requested through the lgb.early_stopping callback rather
# than the `early_stopping_rounds` fit keyword, which is deprecated and was
# removed from fit() in LightGBM 4; the callback form works on both.
# NOTE(review): on LightGBM >= 4 per-iteration metrics are only printed when a
# logging callback is also supplied — confirm against the installed version.
gbm = lgb.LGBMRanker(max_depth=5)
gbm.fit(
    sparse_train,
    y_train,
    group=train_group,
    eval_set=[(sparse_val, y_val)],
    eval_group=[val_group],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

[1]	valid_0's ndcg@1: 0.417323	valid_0's ndcg@2: 0.409914	valid_0's ndcg@3: 0.525944	valid_0's ndcg@4: 0.542173	valid_0's ndcg@5: 0.602543
Training until validation scores don't improve for 50 rounds
[2]	valid_0's ndcg@1: 0.35958	valid_0's ndcg@2: 0.373503	valid_0's ndcg@3: 0.477998	valid_0's ndcg@4: 0.533905	valid_0's ndcg@5: 0.642896
[3]	valid_0's ndcg@1: 0.506562	valid_0's ndcg@2: 0.586348	valid_0's ndcg@3: 0.628844	valid_0's ndcg@4: 0.720667	valid_0's ndcg@5: 0.731356
[4]	valid_0's ndcg@1: 0.501312	valid_0's ndcg@2: 0.584872	valid_0's ndcg@3: 0.737234	valid_0's ndcg@4: 0.766009	valid_0's ndcg@5: 0.780098
[5]	valid_0's ndcg@1: 0.501312	valid_0's ndcg@2: 0.625232	valid_0's ndcg@3: 0.744897	valid_0's ndcg@4: 0.765802	valid_0's ndcg@5: 0.802799
[6]	valid_0's ndcg@1: 0.669291	valid_0's ndcg@2: 0.672449	valid_0's ndcg@3: 0.773467	valid_0's ndcg@4: 0.794916	valid_0's ndcg@5: 0.840932
[7]	valid_0's ndcg@1: 0.669291	valid_0's ndcg@2: 0.828843	valid_0's ndcg@3: 0.8434	valid_0's ndcg@4: 0.830

LGBMRanker(max_depth=5)

In [14]:
# Score the validation feature matrix with the fitted ranker.
val_pred=gbm.predict(sparse_val)

In [15]:
# Number of ndcg@1 values recorded for the validation set ("valid_0") during fit.
len(gbm.evals_result_["valid_0"]["ndcg@1"])

57

In [16]:
# Store the validation scores next to the rows they were computed for.
# NOTE(review): X_val was derived by filtering df, so pandas may emit a
# SettingWithCopyWarning on this in-place column assignment — TODO confirm.
X_val["pred"] = val_pred


In [17]:
# Display the validation metrics the fitted model stored as its best score.
gbm.best_score_

defaultdict(collections.OrderedDict,
            {'valid_0': OrderedDict([('ndcg@1', 0.6692913385826772),
                          ('ndcg@2', 0.828843122965223),
                          ('ndcg@3', 0.8433997887810257),
                          ('ndcg@4', 0.8305029187876253),
                          ('ndcg@5', 0.8665840967086954)])})

In [20]:
# Persist the fitted TF-IDF vectorizer and the ranker. Context managers make
# sure each file handle is flushed and closed (the bare open() calls used
# previously were never closed explicitly).
with open("../models/analyze.pickle", "wb") as fh:
    pickle.dump(analyze, fh)
with open("../models/gbm.pickle", "wb") as fh:
    pickle.dump(gbm, fh)
