# KNN on the news

---
## Imports
This initial section just sets up all the imports we need (or maybe need, I forget if I actually use them all)

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors


from sklearn.feature_extraction import stop_words as stop
import nltk               #natural language tool kit
import re                 #regular expression library
#nltk.download() 
#nltk.download('stopwords')
#nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
sw = set(stopwords.words("english"))

## Functions
The following section is all of the functions that will be used through out this notebook

In [50]:
# converts all strings in a dataframe of strings to lowercase
def df_toLowerCase(df):
    df_lower = df.copy()
    for col_index, col in df_lower.iteritems():
        df_lower[col_index] = col.str.lower()
    return df_lower

# sanitizes a string based upon a whole lot of regex
def line_sanitize(sent):
    sent = re.sub(r'u\.s\.','usa',sent)     #replaces u.s. with usa, later regex was causing u.s. to be lost
    sent = re.sub(r'[^a-z\s-]+',' ',sent)   #leaves all alph a chars and '-' chars, replaces all others with space
    sent = re.sub(r'-',' ',sent)            #replaces the '-' with a space
    sent = re.sub(r'(^[a-z]\s)|(\s[a-z]\s)|(\s[a-z]$)', ' ', sent)  #remove all single characters
    sent = re.sub(r'(^[\s,\t]*)|([\s,\t]*$)','',sent)     #remove space and tabs from start and end of lines
    sent = re.sub(r'\s{1,10}', ' ', sent)   #replaces anywhere with multiple spaces in a row with a single space
    return sent

# sanitizes all strings in a dataframe of strings
def df_sanitize(df):
    df_san = df.copy()
    for col_index, col in df_san.iteritems():
        df_san[col_index] = df_san[col_index].apply(line_sanitize)
    return df_san

# tokenizes all strings in a dataframe of stings
def df_tokenize(df):
    df_token = df.copy()
    for col_index, col in df_token.iteritems():
        df_token[col_index] = df_token[col_index].apply(word_tokenize)
    return df_token

# removes stop words from a dataframe
def df_removeStops(df):
    df_noStops = df.copy()
    for col_index, col in df_noStops.iteritems():
        df_noStops[col_index] = df_noStops[col_index].apply(line_removeStops)
    return df_noStops

# removes stop works from a single sentence
def line_removeStops(sentence):
    no_stops = [w for w in sentence if not w in sw]   #sw = list of stop words 
    return no_stops

# combines all healines columns into a single headline
def df_combineColumns(df):
    concat_columns = df.iloc[:,0].copy()
    for row_index, row in df.iterrows():
        single_string = ' '.join(str(x) for x in row)
        concat_columns[row_index] = single_string
    return concat_columns.to_frame(name='headlines')

# converts tokenized strings back into a string
def df_toStrings(df):
    df_strings = df.copy()
    for row_index, row in df_strings.iterrows():
        row_as_string = ''
        for col_index, col in row.iteritems():
            row_as_string = ' '.join(str(x) for x in col)
        df_strings.iloc[row_index,0] = row_as_string
    return df_strings

# simple prediction for 0,1 based labeling
def predict_from_indices(labels, indices, k):
    predictions = []
    for row in indices:
        predictions.append(round(sum(labels[row]) / (k * 1.)))
    return predictions

def predict_complex_labels(labels, indices, label_set):
    predictions = []
    for row in indices:
        values = labels[row].tolist()
        counts = []
        for x in label_set:
            counts.append(values.count(x))
        max_count = max(counts)
        max_indices = [i for i, x in enumerate(counts) if x == max_count]
        
        if len(max_indices) == 1:
            predictions.append(label_set[max_indices[0]])
        else :
            index_sum = sum(max_indices)
            new_index = round(1.0 * index_sum / len(label_set))
            predictions.append(label_set[new_index])

    return predictions

# Naive prediction based upon previous day's movement
def naive_ts_predict(labels, initial):
    predictions = []
    start = labels.keys()[0]
    end = labels.keys()[labels.shape[0] - 1] + 1
    for i in range(start,end):
        if i == start:
            predictions.append(initial)
        else :
            predictions.append(labels[i - 1])
    return predictions

# accuracy where correct are on the diagonal
def calculate_accuracy(results):
    total = np.sum(results)
    correct = 0;
    for i in range(0, results.shape[0]):
        correct = correct + results[i,i]
    return correct / total

## Data
First we will read in both the news only and the combined data sets and examine them for shape and missing data

In [3]:
news_original = pd.read_csv('/home/dev/CSCD429_Project/Data/RedditNews.csv')
news_original.head(1)

Unnamed: 0,Date,News
0,2016-07-01,A 117-year-old woman in Mexico City finally re...


In [4]:
combined_original = pd.read_csv('/home/dev/CSCD429_Project/Data/Combined_News_DJIA.csv')
combined_original.head(1)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",...,b'Georgia Invades South Ossetia - if Russia ge...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge"""


### Shape and Null Check

In [5]:
print("News Only Shape: " + str(news_original.shape))
print("News Only has null: " + str(news_original.isnull().values.any()))
print("Combined Shape: " + str(combined_original.shape))
print("Combined has null: " + str(combined_original.isnull().values.any()))
print("Data uniformity check: 25 * 1989 = " + str(25 * 1989))

News Only Shape: (73608, 2)
News Only has null: False
Combined Shape: (1989, 27)
Combined has null: True
Data uniformity check: 25 * 1989 = 49725


Notice here that 25 * 1989 = 49725 which is much smaller than the 73608 rows of the News only data set. This is because the news only data set includes ALL days in the time frame, including days when the markey was shut down. The combined data only includes those days where the market was open. For this reason I will proceed with only the combined data. A future task would be to include the market closure data.

---

### Data Preparation
We also notice that the Combined data has null values. We need to set those to empty strings for future processing.


In [6]:
combined_notNull = combined_original.replace(np.nan, '', regex=True)
print("New Combined has null: " + str(combined_notNull.isnull().values.any()))

New Combined has null: False


Now that nulls have been addressed, next we will chop off the date and result columns since we do not need them at this time, and I found it easier to not have to constantly index 2:27. They will be added back later

In [7]:
combined_text = combined_notNull.iloc[:,2:]
combined_text.head(1)

Unnamed: 0,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,Top10,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",b'Georgian troops retreat from S. Osettain cap...,b'Did the U.S. Prep Georgia for War with Russia?',...,b'Georgia Invades South Ossetia - if Russia ge...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge"""


## Cleaning the Data

* Make all strings lower case
* Sanitize the string of punctuation, numbers, and single characters
* Combine all 25 headline columns into one single massive headline
* Remove stop words

In [8]:
combined_lower = df_toLowerCase(combined_text)
combined_lower.head(1)

Unnamed: 0,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,Top10,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,"b""georgia 'downs two russian warplanes' as cou...",b'breaking: musharraf to be impeached.',b'russia today: columns of troops roll into so...,b'russian tanks are moving towards the capital...,"b""afghan children raped with 'impunity,' u.n. ...",b'150 russian tanks have entered south ossetia...,"b""breaking: georgia invades south ossetia, rus...","b""the 'enemy combatent' trials are nothing but...",b'georgian troops retreat from s. osettain cap...,b'did the u.s. prep georgia for war with russia?',...,b'georgia invades south ossetia - if russia ge...,b'al-qaeda faces islamist backlash',"b'condoleezza rice: ""the us would not act to p...",b'this is a busy day: the european union has ...,"b""georgia will withdraw 1,000 soldiers from ir...",b'why the pentagon thinks attacking iran is a ...,b'caucasus in crisis: georgia invades south os...,b'indian shoe manufactory - and again in a se...,b'visitors suffering from mental illnesses ban...,"b""no help for mexico's kidnapping surge"""


In [9]:
combined_sanitized = df_sanitize(combined_lower)
combined_sanitized.head(1)

Unnamed: 0,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,Top10,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,georgia downs two russian warplanes as countri...,breaking musharraf to be impeached,russia today columns of troops roll into south...,russian tanks are moving towards the capital o...,afghan children raped with impunity n official...,russian tanks have entered south ossetia whils...,breaking georgia invades south ossetia russia ...,the enemy combatent trials are nothing but sha...,georgian troops retreat from osettain capital ...,did the usa prep georgia for war with russia,...,georgia invades south ossetia if russia gets i...,al qaeda faces islamist backlash,condoleezza rice the us would not act to preve...,this is busy day the european union has approv...,georgia will withdraw soldiers from iraq to he...,why the pentagon thinks attacking iran is bad ...,caucasus in crisis georgia invades south ossetia,indian shoe manufactory and again in series of...,visitors suffering from mental illnesses banne...,no help for mexico kidnapping surge


In [10]:
combined_singleCol = df_combineColumns(combined_sanitized)
combined_singleCol.head(1)

Unnamed: 0,headlines
0,georgia downs two russian warplanes as countri...


### Removing Stop Words

* Tokenize strings
* Remove stop words
* Convert back to strings

In [11]:
combined_tokens = df_tokenize(combined_singleCol)
combined_tokens.head(1)

Unnamed: 0,headlines
0,"[georgia, downs, two, russian, warplanes, as, ..."


In [12]:
combined_noStops = df_removeStops(combined_tokens)
combined_noStops.head(1)

Unnamed: 0,headlines
0,"[georgia, downs, two, russian, warplanes, coun..."


In [13]:
combined_fullClean = df_toStrings(combined_noStops)
combined_fullClean.head(1)

Unnamed: 0,headlines
0,georgia downs two russian warplanes countries ...


### Rejoin with Dates and Labels
At this point we can write our current data to a file and would have it to pass around and not have to continue cleaning data every time we want to run analysis

In [14]:
dates = combined_original[['Date','Label']]
clean_combined_full = dates.join(combined_fullClean)
clean_combined_full.head(5)
clean_combined_full.to_csv("CombinedClean.csv", sep=",", index=False)

### Word Count, Finally
So after all that cleaning we can look to see how many words we are left with. The CountVectorizer function will convert the strings into vectorized binary values for each word occurrence. Thus the number of columns is the number of unique words

In [15]:
cv = CountVectorizer()
combined_vectors = cv.fit_transform(combined_fullClean['headlines'])
print(combined_vectors.shape)

(1989, 33101)


In total all of the headlines constitute 33101 unique words

---

### Data Splitting
The last step in data modification is to split our data into TEST and TRAIN sets. This wasn't done previously to simply save some space and to ensure that the vectorization included all words we would be working with.

In [16]:
train = clean_combined_full[clean_combined_full['Date'] < '2015-01-01']
train_vectors = combined_vectors[0:train.shape[0]]
test = clean_combined_full[clean_combined_full['Date'] > '2014-12-31']
test_vectors = combined_vectors[train.shape[0]:]

## Analysis 1 - Naive TimeSeries
The most obvious technique would be to say "if it went up yesterday it will go up today." This technique is applied to provide a baseline for other techniques.

In [17]:
naive_predictions = naive_ts_predict(test['Label'],train['Label'][train.shape[0] - 1])
naive_results = pd.crosstab(test["Label"], np.array(naive_predictions), rownames=["Actual"], colnames=["Predicted"])

print(naive_results)
calculate_accuracy(naive_results.values)

Predicted    0    1
Actual             
0           84  102
1          103   89


0.4576719576719577

## Analysis 2 - Regression
Here the LogisticRegression function is used to perform regression on our behalf. Logistic regression is similar to LinearRegression except it predicts categorical values, not continuous.

In [18]:
logR = LogisticRegression()
logR = logR.fit(train_vectors, train["Label"])
logPredictions = logR.predict(test_vectors)
log_results = pd.crosstab(test["Label"], logPredictions, rownames=["Actual"], colnames=["Predicted"])

print(log_results)
calculate_accuracy(log_results.values)



Predicted   0    1
Actual            
0          63  123
1          99   93


0.4126984126984127

## Analysis 3 - KNN
Here we use the built in NearestNeighbors function. Once the indices of the nearest neighbors are found, a simple majority rule for labeling is applied.

In [60]:
k = 5
nn = NearestNeighbors(k)
nn = nn.fit(train_vectors)
distances, indices = nn.kneighbors(test_vectors)

knn_predictions = predict_from_indices(train['Label'], indices, k)
knn_results = pd.crosstab(test["Label"], np.array(knn_predictions), rownames=["Actual"], colnames=["Predicted"])

print(knn_results)
calculate_accuracy(knn_results.values)

Predicted   0    1
Actual            
0          56  130
1          54  138


0.5132275132275133

## Analysis 4 - KNN with multiple labels
The categories were expaned from simple 0,1 up/down values to more descriptive measurements of movement. the KNN indices were then evaluated based upon these new expanded labels. The results are so poor percentages were not calcualted.

In [53]:
full_labels = pd.read_csv('/home/dev/CSCD429_Project/Data/DJIA_labels.csv')
full_labels.head(1)

full_labels_train = full_labels[full_labels['Date'] < '2015-01-01']
full_labels_test = full_labels[full_labels['Date'] > '2014-12-31']

label_set = ["Down Extreme", "Down Large", "Down", "No Move", "Up", "Up Large", "Up Extreme"]
close_labels = predict_complex_labels(full_labels_train['CloseDiffCat'], indices, label_set)
pd.crosstab(full_labels_test["CloseDiffCat"], np.array(close_labels), rownames=["Actual"], colnames=["Predicted"])


Predicted,Down,Down Extreme,Down Large,No Move,Up,Up Extreme,Up Large
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Down,94,2,21,3,7,1,8
Down Extreme,1,0,0,0,0,0,0
Down Large,29,1,5,0,1,0,2
No Move,25,0,9,1,0,0,3
Up,98,0,20,2,5,0,6
Up Extreme,3,0,0,0,1,0,0
Up Large,14,1,7,1,5,0,2


In [54]:
label_set = ["Down Extreme", "Down Large", "Down", "No Move", "Up", "Up Large", "Up Extreme"]
close_labels = predict_complex_labels(full_labels_train['SpreadCat'], indices, label_set)
pd.crosstab(full_labels_test["SpreadCat"], np.array(close_labels), rownames=["Actual"], colnames=["Predicted"])

Predicted,Down,Down Large,No Move,Up,Up Extreme,Up Large
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Down,10,35,13,68,22,1
No Move,3,9,2,26,3,0
Up,12,33,11,68,19,4
Up Extreme,0,4,1,2,0,1
Up Large,6,4,3,16,2,0


In [55]:
label_set = ["Down Extreme", "Down Large", "Down", "No Move", "Up", "Up Large", "Up Extreme"]
close_labels = predict_complex_labels(full_labels_train['ONDiffCat'], indices, label_set)
pd.crosstab(full_labels_test["ONDiffCat"], np.array(close_labels), rownames=["Actual"], colnames=["Predicted"])

Predicted,Down,Down Large,No Move,Up
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Down,5,25,3,74
Down Extreme,0,8,1,11
Down Large,0,9,1,32
No Move,6,4,2,32
Up,5,28,4,75
Up Extreme,1,6,1,13
Up Large,2,4,1,25


In [56]:
label_set = ["Down Extreme", "Down Large", "Down", "No Move", "Up", "Up Large", "Up Extreme"]
close_labels = predict_complex_labels(full_labels_train['DailyMoveCat'], indices, label_set)
pd.crosstab(full_labels_test["DailyMoveCat"], np.array(close_labels), rownames=["Actual"], colnames=["Predicted"])

Predicted,Down,Down Extreme,Down Large,No Move,Up,Up Extreme,Up Large
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Down,99,2,23,3,8,1,8
Down Extreme,1,0,0,0,0,0,0
Down Large,25,1,4,0,0,0,2
No Move,30,0,7,0,0,0,2
Up,95,0,19,1,5,0,7
Up Extreme,3,0,0,0,1,0,0
Up Large,17,1,7,0,4,0,2
