In [None]:
#Importing the required libraries
from gensim.models import Word2Vec, word2vec
import logging
import matplotlib.pyplot as plt
import nltk
import numpy as np
from nltk.corpus import stopwords
import numpy as np
import os
import pandas as pd
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [None]:
# Fetch the NLTK resources this notebook relies on, then load the punkt
# tokenizer that is used for splitting news into sentences
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Read the news dataset from the relative path 'dataset.csv' and preview its first rows
main_data = pd.read_csv('dataset.csv')
main_data.head()

Unnamed: 0,news,type
0,China had role in Yukos split-up\n \n China le...,business
1,Oil rebounds from weather effect\n \n Oil pric...,business
2,Indonesia 'declines debt freeze'\n \n Indonesi...,business
3,$1m payoff for former Shell boss\n \n Shell is...,business
4,US bank in $515m SEC settlement\n \n Five Bank...,business


In [None]:
# Sanity check: (number of rows, number of columns) of the loaded frame
print(main_data.shape)

(2225, 2)


In [None]:
# Encode the news categories held in the 'type' column as integers:
#   num_data  = one integer code per row of main_data
#   meta_data = the distinct category labels (code i stands for meta_data[i])
# The labels are taken straight from main_data: no `y_train` exists at this
# point, and num_data must line up row-for-row with the feature vectors that
# are later built from main_data['news'].
num_data, meta_data = pd.factorize(main_data['type'])

NameError: ignored

In [None]:
# Show the integer codes returned by pd.factorize
print(num_data)

[0 0 0 ... 4 4 4]


In [None]:
#Convert a news to a list of words
def news_to_wordlist(news, remove_stopwords=False):
    """
    Turn one news text into a flat list of lower-case word tokens.

    Arguments:
    news = news from the dataset (a string)
    remove_stopwords = boolean value; when True the NLTK English stop words
                       are dropped from the result

    Output:
    words = list of words in a single news
    """
    # every character that is not an ASCII letter becomes a blank; the
    # lower-cased text is then cut at whitespace
    tokens = re.sub("[^a-zA-Z]", " ", news).lower().split()

    # stop-word filtering is opt-in (false by default)
    if not remove_stopwords:
        return tokens

    stop_set = set(stopwords.words("english"))
    return [token for token in tokens if token not in stop_set]

In [None]:
#Split news into list of sentences where each sentence is a list of words.
def news_to_sentences(news, tokenizer, remove_stopwords=False):
    """
    Break one news text into sentences and each sentence into words.

    Arguments:
    news = news to be tokenized
    tokenizer = the nltk tokenizer (only its tokenize() method is used)
    remove_stopwords = boolean value, forwarded to news_to_wordlist

    Output:
    sentences = list of sentences where each sentence is also a list of words
    """
    # the tokenizer cuts the stripped text into raw sentence strings; empty
    # ones are skipped and the rest are split into words
    return [
        news_to_wordlist(sentence_text, remove_stopwords)
        for sentence_text in tokenizer.tokenize(news.strip())
        if len(sentence_text) > 0
    ]

In [None]:
train_sentences = []  # collects the tokenised sentences of the whole dataset
for news in main_data['news']:
    # split the current news into sentences and add all of them
    train_sentences.extend(news_to_sentences(news, tokenizer))

In [None]:
# Inspect the first tokenised sentence (a list of lower-case words)
train_sentences[0]

['china',
 'had',
 'role',
 'in',
 'yukos',
 'split',
 'up',
 'china',
 'lent',
 'russia',
 'bn',
 'bn',
 'to',
 'help',
 'the',
 'russian',
 'government',
 'renationalise',
 'the',
 'key',
 'yuganskneftegas',
 'unit',
 'of',
 'oil',
 'group',
 'yukos',
 'it',
 'has',
 'been',
 'revealed']

In [None]:
#Set values for various word2vec parameters
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 3       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
# NOTE(review): `size=` is the keyword of older gensim releases; newer releases
# are believed to call it `vector_size` - confirm against the installed version.
model = word2vec.Word2Vec(train_sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling)

# NOTE(review): presumably normalises the word vectors in place and discards
# the raw ones, which would rule out further training - confirm in the gensim docs.
model.init_sims(replace=True)

# Saving the model for later use.
model.save('model_name')

# Loading the model
# `model` is rebound to the copy read back from 'model_name'; the cells below
# work with that reloaded object.
model = Word2Vec.load('model_name')

In [None]:
#Average the word vectors for a set of words
def make_feature_vec(words, model, num_features):
    """
    Average the word vectors of the in-vocabulary words of one news.

    Arguments:
    words = words in a news
    model = the word2Vec model (its `wv` keyed vectors are used)
    num_features = number of features for the model

    Output:
    feature_vec = float32 vector of length num_features holding the mean of
                  the vectors of all words known to the model; an all-zero
                  vector when none of the words is known
    """
    feature_vec = np.zeros((num_features,), dtype="float32")  # running sum
    nwords = 0
    index2word_set = set(model.wv.index2word)  # words known to the model

    for word in words:
        if word in index2word_set:
            nwords += 1
            # look the vector up through model.wv, the same object the
            # vocabulary came from (direct model[word] indexing is deprecated
            # in gensim 3.x and removed in 4.x)
            feature_vec = np.add(feature_vec, model.wv[word])

    # A news without a single known word would otherwise be divided by zero
    # and come back as a NaN vector; keep the zero vector instead.
    if nwords == 0:
        return feature_vec

    return np.divide(feature_vec, nwords)

In [None]:
#Calculate average feature vectors for all news
def get_avg_feature_vecs(news, model, num_features):
    """
    Build one averaged feature vector per news.

    Arguments:
    news = sequence of news, each given as a list of words
    model = the word2Vec model
    num_features = number of features in the feature vector

    Output:
    news_feature_vecs = float32 array of shape (len(news), num_features);
                        row i is make_feature_vec applied to news i
    """
    # allocate the whole result up front and fill it row by row
    news_feature_vecs = np.zeros((len(news), num_features), dtype='float32')
    for row, word_list in enumerate(news):
        news_feature_vecs[row] = make_feature_vec(word_list, model, num_features)
    return news_feature_vecs

In [None]:
# Tokenise every news with stop words removed, then turn each word list into
# one averaged word2vec feature vector
clean_train_news = [
    news_to_wordlist(new, remove_stopwords=True) for new in main_data['news']
]
trainDataVecs = get_avg_feature_vecs(clean_train_news, model, num_features)



In [None]:
# Preview the feature vectors of the first five news
print(trainDataVecs[:5])

[[-2.0841040e-02 -3.5884351e-02  4.1210413e-02 ... -2.5158955e-02
   7.8683123e-03  4.9836688e-02]
 [-1.7440600e-02 -3.0171243e-02  2.6495187e-02 ... -2.5629971e-02
   6.3768323e-03  4.8588254e-02]
 [-5.0745177e-04 -3.8599502e-02  2.9329101e-02 ... -2.2326024e-02
   2.2007537e-03  3.6792152e-02]
 [-6.4780205e-03 -3.4437228e-02  1.7679147e-02 ... -1.2783796e-02
  -5.8734096e-03  3.2464299e-02]
 [-1.9787379e-02 -5.0354157e-02  2.9083764e-02 ... -2.5976785e-02
   2.3560506e-06  4.1006099e-02]]


In [None]:
# Number of feature vectors (get_avg_feature_vecs builds one row per news)
print(len(trainDataVecs))

2225


In [None]:
#Random Forest Classifier
from collections import Counter

#function to calculate entropy
def entropy(y):
    """
    Arguments:
    y = the labels (non-negative integers, as required by np.bincount)

    Output:
    entropy of the input, measured in bits (log base 2)
    """
    # relative frequency of every label value from 0 to max(y)
    class_counts = np.bincount(y)
    probabilities = class_counts / len(y)

    # label values that never occur contribute nothing and would hit log2(0)
    present = probabilities[probabilities > 0]
    return -np.sum(present * np.log2(present))

#Class for node of decision trees
class Node:
    """
    One node of a decision tree.

    An internal node stores the split (feature, threshold) and its two
    children (left, right); a leaf stores only value. value can be passed
    by keyword only.
    """

    #Initializing the class
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # description of the split and links to the children
        self.feature, self.threshold = feature, threshold
        self.left, self.right = left, right
        # payload of a leaf; stays None on internal nodes
        self.value = value

    #Checking whether the node is a leaf node or not
    def is_leaf_node(self):
        """Return True when the node carries a value, i.e. it is a leaf."""
        return not (self.value is None)

#Class for decision tree
class DecisionTree:
    """
    Decision-tree classifier that splits on the threshold with the highest
    information gain.

    Arguments:
    min_samples_split = a node with fewer samples than this becomes a leaf
    max_depth = depth at which a node becomes a leaf whatever it contains
    n_feats = number of randomly chosen features examined at each split;
              None means every feature (resolved in fit)
    """

    #Initializing the class
    def __init__(self, min_samples_split=2, max_depth=100, n_feats=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.root = None  # root Node, set by fit

    #Method to fit the decision tree
    def fit(self, X, y):
        """
        Grow the tree on the training data.

        Arguments:
        X = 2-D numpy array, one row per sample
        y = 1-D numpy array of non-negative integer labels (entropy relies
            on np.bincount)
        """
        # never ask for more features per split than the data has
        self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1])
        self.root = self._grow_tree(X, y)

    #Method to predict
    def predict(self, X):
        """Return a numpy array with the predicted label of every row of X."""
        #traversing the tree from the root
        return np.array([self._traverse_tree(x, self.root) for x in X])

    #Method to grow trees
    def _grow_tree(self, X, y, depth=0):
        """Recursively build and return the Node for the samples (X, y)."""
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        # stopping criteria
        if (depth >= self.max_depth
                or n_labels == 1
                or n_samples < self.min_samples_split):
            return Node(value=self._most_common_label(y))

        #randomly select features
        feat_idxs = np.random.choice(n_features, self.n_feats, replace=False)

        # greedily select the best split according to information gain
        best_feat, best_thresh = self._best_criteria(X, y, feat_idxs)

        # no candidate split at all (e.g. no features to choose from)
        if best_feat is None:
            return Node(value=self._most_common_label(y))

        left_idxs, right_idxs = self._split(X[:, best_feat], best_thresh)

        # When every examined feature is constant in this node, the "best"
        # split puts all samples on one side. Growing the empty side used to
        # fail in _most_common_label (IndexError on an empty label array), so
        # such a node is turned into a leaf instead.
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return Node(value=self._most_common_label(y))

        # grow the children that result from the split
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth+1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth+1)
        return Node(best_feat, best_thresh, left, right)

    #Selecting the best split point
    def _best_criteria(self, X, y, feat_idxs):
        """
        Try every unique value of every feature in feat_idxs as a threshold.

        Output:
        (feature index, threshold) of the highest information gain, or
        (None, None) when there was nothing to try
        """
        best_gain = -1
        split_idx, split_thresh = None, None
        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                gain = self._information_gain(y, X_column, threshold)

                # strict '>' keeps the first candidate among equal gains
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_thresh = threshold

        return split_idx, split_thresh

    #To calculate the information gain
    def _information_gain(self, y, X_column, split_thresh):
        """
        Entropy of y minus the size-weighted entropy of the two children
        produced by splitting X_column at split_thresh; 0 when one child
        would be empty.
        """
        #parent loss
        parent_entropy = entropy(y)

        # generate split
        left_idxs, right_idxs = self._split(X_column, split_thresh)

        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0

        # compute the weighted avg. of the loss for the children
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = entropy(y[left_idxs]), entropy(y[right_idxs])
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r

        # information gain is difference in loss before vs. after split
        return parent_entropy - child_entropy

    #Splitting the data on the basis of split threshold
    def _split(self, X_column, split_thresh):
        """Return (indices with value <= split_thresh, indices with value > split_thresh)."""
        left_idxs = np.argwhere(X_column <= split_thresh).flatten()
        right_idxs = np.argwhere(X_column > split_thresh).flatten()
        return left_idxs, right_idxs

    #Traversing the tree
    def _traverse_tree(self, x, node):
        """Follow the splits from node downwards and return the label of the leaf reached by x."""
        #returns the value if the node is a leaf
        if node.is_leaf_node():
            return node.value
        #Traverse to the left if the feature is lesser than the threshold
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    # Calculating the most common label
    def _most_common_label(self, y):
        """Return the most frequent label in y (y must not be empty)."""
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common

In [None]:
# Generating the random samples
def bootstrap_sample(X, y):
    """
    Draw a bootstrap sample: as many rows as X has, picked with replacement
    from NumPy's global random generator.

    Output:
    (X rows, y entries) selected by the same random indices
    """
    size = X.shape[0]
    picked = np.random.choice(size, size=size, replace=True)
    X_boot = X[picked]
    y_boot = y[picked]
    return X_boot, y_boot

def most_common_label(y):
    """Return the label that occurs most often in y (y must not be empty)."""
    # most_common(1) yields a one-element list of (label, count) pairs
    top_entries = Counter(y).most_common(1)
    return top_entries[0][0]

#Random Forest Class
class RandomForest:
    """
    Ensemble of DecisionTree classifiers, each trained on its own bootstrap
    sample; the prediction is the majority vote of the trees.

    Arguments:
    n_trees = number of trees grown by fit
    min_samples_split, max_depth, n_feats = passed on to every DecisionTree
    """

    #Initilizing the class
    def __init__(self, n_trees=10, min_samples_split=2,
                 max_depth=100, n_feats=None):
        self.n_trees, self.n_feats = n_trees, n_feats
        self.min_samples_split, self.max_depth = min_samples_split, max_depth
        self.trees = []  # filled by fit

    # Method to fit the model
    def fit(self, X, y):
        """Discard any earlier trees and grow n_trees new ones on (X, y)."""
        self.trees = []
        #Generates n number of trees
        for _ in range(self.n_trees):
            # every tree sees its own resampled copy of the training data
            X_boot, y_boot = bootstrap_sample(X, y)
            member = DecisionTree(
                min_samples_split=self.min_samples_split,
                max_depth=self.max_depth,
                n_feats=self.n_feats,
            )
            member.fit(X_boot, y_boot)
            self.trees.append(member)

    # Method to predict result from the model built
    def predict(self, X):
        """Return a numpy array with the majority-vote label of every row of X."""
        # one row of predictions per tree ...
        votes_by_tree = np.array([member.predict(X) for member in self.trees])
        # ... swapped so that each row holds all votes for one sample
        votes_by_sample = np.swapaxes(votes_by_tree, 0, 1)
        return np.array(list(map(most_common_label, votes_by_sample)))

In [None]:
#Splitting the dataset into training and testing
# Features are the averaged word vectors, targets the integer category codes.
# test_size=0.2 holds out 20% of the rows; the fixed random_state makes the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(trainDataVecs,num_data, test_size=0.2, random_state=1234)

In [None]:
#Calling the randomforest classifier
# bootstrap_sample and DecisionTree draw from NumPy's global random generator,
# so it is seeded first; without a seed every run grows a different forest and
# the reported accuracy cannot be reproduced.
np.random.seed(1234)
random_clf = RandomForest(n_trees=3)
random_clf.fit(X_train,Y_train)

In [None]:
#To calculate the accuracy of the model
def accuracy(y_true, y_pred):
    """
    Fraction of predictions that equal the true labels.

    Arguments:
    y_true = true labels (1-D array-like)
    y_pred = predicted labels (1-D array-like of the same length)

    Output:
    share of positions at which y_true and y_pred agree

    Raises:
    ValueError when the two inputs do not have the same shape
    """
    # Convert first: on two plain lists `==` yields a single bool instead of
    # an element-wise comparison, which silently produced a wrong score.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got %s and %s"
            % (y_true.shape, y_pred.shape)
        )
    return np.sum(y_true == y_pred) / len(y_true)

In [None]:
#Calling the predict method on testing dataset
# random_clf.predict returns one majority-vote label per row of X_test
Y_pred = random_clf.predict(X_test)

#Finding the accuracy
# share of test rows whose predicted label equals the true label
acc = accuracy(Y_test, Y_pred)

print ("Accuracy:", acc)

Accuracy: 0.8808988764044944


In [None]:
class LogisticRegression(object):
    """
    One-vs-rest logistic regression trained with batch gradient descent.

    Arguments:
    alpha = learning rate of every gradient-descent step
    n_iteration = number of gradient-descent steps run for each class

    After fit:
    theta = list of (parameter vector, class label), one entry per class
    cost = list of (cost after each step, class label), one entry per class
    """

    def __init__(self, alpha=0.01, n_iteration=100):
        """Store the learning rate and the number of iterations."""
        self.alpha = alpha
        self.n_iter = n_iteration

    def _sigmoid_function(self, x):
        """Return 1 / (1 + e^-x) for a scalar or array x."""
        # np.exp overflows float64 just above 709. The sigmoid is fully
        # saturated long before that, so limiting the argument avoids the
        # overflow warning without changing the result in any useful range.
        x = np.clip(x, -709, 709)
        return 1 / (1 + np.exp(-x))

    def _cost_function(self, h, theta, y):
        """
        Mean cross-entropy between the predicted probabilities h and the 0/1
        targets y. theta is not used; it is kept for the existing call
        signature.
        """
        m = len(y)
        # A probability of exactly 0 or 1 makes log() return -inf and the
        # product 0 * -inf turn the whole cost into NaN. Move only those
        # extreme values to the nearest float strictly inside (0, 1).
        h = np.clip(h, np.nextafter(0.0, 1.0), np.nextafter(1.0, 0.0))
        cost = (1 / m) * (np.sum(-y.T.dot(np.log(h)) - (1 - y).T.dot(np.log(1 - h))))
        return cost

    def _gradient_descent(self, X, h, theta, y, m):
        """
        Take one gradient-descent step.

        theta is updated in place (the caller's array is modified) and also
        returned.
        """
        gradient_value = np.dot(X.T, (h - y)) / m
        theta -= self.alpha * gradient_value
        return theta

    def fit(self, X, y):
        """
        Train one binary classifier per distinct label in y.

        Arguments:
        X = 2-D array of features, one row per sample
        y = 1-D array-like of class labels

        Output:
        the fitted estimator itself, so calls can be chained
        """
        print("Fitting the given dataset..")
        self.theta = []
        self.cost = []
        # a plain list would make `y == label` a single bool below
        y = np.asarray(y)
        # constant 1 in front of every row acts as the intercept term
        X = np.insert(X, 0, 1, axis=1)
        m = len(y)
        for i in np.unique(y):
            # current label against all other labels
            y_onevsall = np.where(y == i, 1, 0)
            theta = np.zeros(X.shape[1])
            cost = []
            for _ in range(self.n_iter):
                z = X.dot(theta)
                h = self._sigmoid_function(z)
                theta = self._gradient_descent(X, h, theta, y_onevsall, m)
                # h was computed before the step, so this is the cost of the
                # parameters the step started from
                cost.append(self._cost_function(h, theta, y_onevsall))
            self.theta.append((theta, i))
            self.cost.append((cost, i))
        return self

    def predict(self, X):
        """
        Return a list with one label per row of X: the label of the
        classifier that gives the highest probability (ties go to the larger
        label because (probability, label) pairs are compared).
        """
        X = np.insert(X, 0, 1, axis=1)
        X_predicted = [
            max((self._sigmoid_function(row.dot(theta)), label) for theta, label in self.theta)[1]
            for row in X
        ]

        return X_predicted

In [None]:
#Calling the logistic regression method on the training dataset
# fit returns the estimator itself, so construction and training are chained.
# alpha keeps its default of 0.01; 30000 gradient-descent steps are run for
# every class.
logi = LogisticRegression(n_iteration=30000).fit(X_train, Y_train)

Fitting the given dataset..


In [None]:
# Prediction on test set
# logi.predict returns a plain Python list with one label per row of X_test
prediction = logi.predict(X_test)
# note: this rebinds `acc`, replacing the random-forest accuracy stored earlier
acc = accuracy(Y_test, prediction)

print ("Accuracy:", acc)

Accuracy: 0.802247191011236
