## Loading modules, data and preprocessing

In [None]:
import sklearn 
# Import all of the scikit learn stuff 
from __future__ import print_function 
from sklearn.decomposition import TruncatedSVD 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import Normalizer 
from sklearn import metrics 
from sklearn.cluster import KMeans, MiniBatchKMeans
import pandas as pd
import numpy as np

In [None]:
import glob, os

# Root directory holding one sub-folder per category; each sub-folder
# contains the .txt documents that belong to that category.
DATA_DIR = 'sample_data'

# Paths are built relative to DATA_DIR instead of os.chdir()-ing into it:
# with chdir, any exception while reading left the kernel stranded inside
# the data directory and a re-run of this cell failed.
if not os.path.isdir(DATA_DIR):
    raise FileNotFoundError('data directory "{}" not found'.format(DATA_DIR))

# Folder names double as category labels. Only real directories are kept
# (a stray file next to the folders is not a category) and the list is
# sorted because glob makes no ordering guarantee.
folders = sorted(
    os.path.basename(path)
    for path in glob.glob(os.path.join(DATA_DIR, '*'))
    if os.path.isdir(path)
)
# print(folders)

all_texts = []        # raw text of every document
all_categories = []   # folder (category) name, aligned index-by-index with all_texts

for folder in folders:
    print('importing text files from "{}" folder...'.format(folder), end=' ')

    # Sorted so that document order is reproducible between runs.
    files_in_folder = sorted(glob.glob(os.path.join(DATA_DIR, folder, '*.txt')))

    for _file_ in files_in_folder:
        with open(_file_, 'r', encoding='latin-1') as f:
            all_texts.append(f.read())
        all_categories.append(folder)

    print('found {} files'.format(len(files_in_folder)))

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Shared NLTK resources for the preprocessing step.
wordnet_lemmatizer = WordNetLemmatizer()  # lemmatizer instance kept available for preprocessing
stopwords = nltk.corpus.stopwords  # stop-word corpus reader
eng_stopwords = nltk.corpus.stopwords.words('english')  # consulted by basic_preprocessing

def basic_preprocessing(text, lemmatize=False):
    """Lower-case, strip bracketed citations, tokenize and filter a document.

    Parameters
    ----------
    text : str
        Raw document text.
    lemmatize : bool, optional
        When True, every surviving token is passed through
        ``wordnet_lemmatizer.lemmatize``. Defaults to False, which keeps the
        previous behaviour (lemmatization disabled).

    Returns
    -------
    list of str
        Tokens in their original order, excluding entries of
        ``eng_stopwords`` and tokens of length one or less.
    """
    # Build the lookup once per call: testing membership against the
    # module-level list would rescan it for every single token.
    stop_set = set(eng_stopwords)

    text = text.lower()
    # Drop bracketed spans such as the "[12]" citation markers found in
    # wiki articles (non-greedy, so each bracket pair is removed separately).
    text = re.sub(r'\[.*?\]', '', text)

    tokens = [
        word
        for word in word_tokenize(text)
        if len(word) > 1 and word not in stop_set
    ]

    if lemmatize:
        tokens = [wordnet_lemmatizer.lemmatize(word) for word in tokens]

    return tokens
# Tokenized/filtered version of every loaded document, in the same order as all_texts.
processed_texts = list(map(basic_preprocessing, all_texts))

## Creating the TFIDF Matrix of data

In [None]:
# Fit the vectorizer on the raw documents and build the document-term
# TF-IDF matrix in one step (equivalent to fit() followed by transform()).
x = TfidfVectorizer()
tfidf = x.fit_transform(all_texts)

# The first 20 characters of each document serve as a readable row label.
all_texts_summary = [text[:20] for text in all_texts]

# get_feature_names() was removed in scikit-learn 1.2; prefer the
# replacement and fall back only on releases that predate it.
feature_names = (
    x.get_feature_names_out()
    if hasattr(x, 'get_feature_names_out')
    else x.get_feature_names()
)

# toarray() yields a plain ndarray, whereas todense() returns np.matrix.
pd.DataFrame(tfidf.toarray(), index=all_texts_summary, columns=feature_names)

## Applying SVD

In [None]:
n_topics = 19  # number of latent concepts to keep

# The ARPACK solver can only extract fewer components than the smaller
# dimension of the matrix; fail early with a readable message.
if n_topics >= min(tfidf.shape):
    raise ValueError(
        'n_topics={} must be smaller than min(n_documents, n_terms)={}'.format(
            n_topics, min(tfidf.shape)
        )
    )

# random_state pins the solver's starting vector so reruns are reproducible.
lsa = TruncatedSVD(n_components=n_topics, algorithm='arpack', random_state=0)
lsa.fit(tfidf)
lsa_data = lsa.transform(tfidf)  # documents projected into concept space

### Printing concept-word matrix

In [None]:
# One label per latent concept: concept0 ... concept{n_topics-1}.
concepts = ['concept{}'.format(i) for i in range(n_topics)]

# get_feature_names() was removed in scikit-learn 1.2; prefer the
# replacement and fall back only on releases that predate it.
concept_terms = (
    x.get_feature_names_out()
    if hasattr(x, 'get_feature_names_out')
    else x.get_feature_names()
)

# Rows are concepts, columns are vocabulary terms.
pd.DataFrame(lsa.components_, columns=concept_terms, index=concepts)

### Printing document-concept matrix

In [None]:
# Rows are documents (labelled by their 20-character summaries), columns are concepts.
pd.DataFrame(data=lsa_data, columns=concepts, index=all_texts_summary)


### Obtaining document-document similarities

In [None]:
# Pairwise dot products between the documents' concept-space vectors.
# Plain ndarrays with np.dot replace the discouraged np.matrix type; the
# resulting values are the same.
doc_vectors = np.asarray(lsa_data)
matrix_similarity = np.dot(doc_vectors, doc_vectors.T)
pd.DataFrame(matrix_similarity, index=all_texts_summary, columns=all_texts_summary)

## Applying LSA using gensim module

In [None]:
from gensim import corpora, models

# Vocabulary built from the preprocessed token lists
dictionary = corpora.Dictionary(processed_texts)

# Each token list converted to gensim's bag-of-words representation
corpus = list(map(dictionary.doc2bow, processed_texts))

# TF-IDF weighting learned from the bag-of-words corpus.
# NOTE(review): this rebinds `tfidf`, the name the scikit-learn cells use for
# their TF-IDF matrix, so re-running those cells after this one will not see
# the matrix any more.
tfidf = models.TfidfModel(corpus)

# TF-IDF weighted view of the corpus
corpus_tfidf = tfidf[corpus]

# LSA model trained on the TF-IDF weighted corpus, keeping 10 topics
lsi = models.LsiModel(corpus=corpus_tfidf, num_topics=10, id2word=dictionary)

In [None]:
# corpus_tfidf was produced by indexing the TfidfModel with the corpus;
# presumably `.obj` is the transformation object wrapped by that result
# (i.e. the TfidfModel itself) -- TODO confirm against the gensim version in use.
print(corpus_tfidf.obj)

In [None]:
# Show the 10 topics of the LSI model.
lsi.print_topics(num_topics=10)

## Converting each document to its concept space and using the new vectors for classification

In [None]:
import numpy as np

# The LSI model was trained on the TF-IDF weighted corpus, so documents must
# be projected from that same representation (not from raw bag-of-words
# counts, which live on a different scale).
#
# Rows are filled by topic id rather than by position: if a document's LSI
# vector omits a topic, positional packing would yield ragged or misaligned
# rows. Omitted topics simply stay at 0.0.
lsi_corpus = np.zeros((len(corpus), lsi.num_topics))
for row, lsi_doc in enumerate(lsi[corpus_tfidf]):
    for topic_id, weight in lsi_doc:
        lsi_corpus[row, topic_id] = weight
print(lsi_corpus.shape)


from sklearn.naive_bayes import BernoulliNB

# Train a Bernoulli Naive Bayes classifier on the concept-space vectors,
# using the folder names as class labels.
# NOTE(review): BernoulliNB presumably binarises its inputs by default, which
# would reduce each LSI weight to its sign -- confirm this is intended.
nb_model = BernoulliNB().fit(lsi_corpus, all_categories)

# Predictions are made on the same documents the model was fitted on.
nb_model.predict(lsi_corpus)