# $$\text{NLP: Natural Language Processing}$$


# OBJECTIVES:
# 1. Tokens/Bag of Words
# 2. Text Normalization: stemming/lemmatizing, stop words, punctuation/whitespace
# 3. TF-IDF
# 4. Cosine Similarity

<br>

## BONUS (https://www.youtube.com/watch?v=6zm9NC9uRkk):
## 1. spacy
## $\quad$ a. Entity Identification
## $\quad$ b. Part of Speech Tagging
## $\quad$ c. Text "Normalization"
## 2. gensim 
## $\quad$ a. Phrase Modeling
## $\quad$ b. Topic Modeling
## $\quad$ c. Word2Vec
## 3. textacy
## $\quad$ "C". is for "Charley"

In [None]:
# Connect to MongoDB with MongoClient's default arguments and grab the
# `articles` collection of the `nyt_dump` database.
from pymongo import MongoClient
client = MongoClient()
db = client.nyt_dump
coll = db.articles

In [None]:
# (mojibake sequence, ASCII replacement) pairs.  Each sequence is the UTF-8
# encoding of a typographic character read back as Latin-1 (one code point per
# byte), e.g. u'\xe2\x80\x99' is a mis-decoded RIGHT SINGLE QUOTATION MARK.
_MOJIBAKE_TO_ASCII = (
    (u'\xe2\x80\x99', "'"),
    (u'\xc3\xa9', 'e'),
    (u'\xe2\x80\x90', '-'),
    (u'\xe2\x80\x91', '-'),
    (u'\xe2\x80\x92', '-'),
    (u'\xe2\x80\x93', '-'),
    (u'\xe2\x80\x94', '-'),
    (u'\xe2\x80\x98', "'"),
    (u'\xe2\x80\x9b', "'"),
    (u'\xe2\x80\x9c', '"'),
    (u'\xe2\x80\x9d', '"'),
    (u'\xe2\x80\x9e', '"'),
    (u'\xe2\x80\x9f', '"'),
    (u'\xe2\x80\xa6', '...'),
    (u'\xe2\x80\xb2', "'"),
    (u'\xe2\x80\xb3', "'"),
    (u'\xe2\x80\xb4', "'"),
    (u'\xe2\x80\xb5', "'"),
    (u'\xe2\x80\xb6', "'"),
    (u'\xe2\x80\xb7', "'"),
    (u'\xe2\x81\xba', "+"),
    (u'\xe2\x81\xbb', "-"),
    (u'\xe2\x81\xbc', "="),
    (u'\xe2\x81\xbd', "("),
    (u'\xe2\x81\xbe', ")"),
)

# Full replacement table: every mojibake sequence plus the real character it
# stands for, so correctly decoded text (e.g. u'\u2019') is normalised too.
_ASCII_REPLACEMENTS = []
for _seq, _ascii in _MOJIBAKE_TO_ASCII:
    _ASCII_REPLACEMENTS.append((_seq, _ascii))
    _ASCII_REPLACEMENTS.append((_seq.encode('latin-1').decode('utf-8'), _ascii))


def unicodetoascii(text):
    """Replace typographic punctuation (and e-acute) with ASCII equivalents.

    INPUT: unicode string
    OUTPUT: unicode string

    Handles both the properly decoded characters (curly quotes, dashes,
    ellipsis, primes, superscript operators, e-acute) and their
    UTF-8-read-as-Latin-1 mojibake forms.  All other characters are left
    untouched.
    """
    # None of the search strings overlap and every replacement is pure ASCII,
    # so the order of the substitutions does not affect the result.
    for needle, ascii_equivalent in _ASCII_REPLACEMENTS:
        text = text.replace(needle, ascii_equivalent)
    return text

In [None]:
# One lower-cased, ASCII-normalised string per article: the pieces of each
# article's 'content' field are joined with single spaces.
documents = []
for article in coll.find():
    joined_content = ' '.join(article['content'])
    documents.append(unicodetoascii(joined_content.lower()))

In [None]:
# Keep only the non-empty documents.
documents = [text for text in documents if len(text) != 0]

In [None]:
# Parenthesised single-argument print: same output under Python 2, and also
# valid Python 3 syntax.
print(len(documents))
documents[2]

In [None]:
import nltk
#nltk.download('punkt')

In [None]:
# One token list per document, in document order.
tokenized_docs = [nltk.word_tokenize(text) for text in documents]

In [None]:
# http://text-processing.com/demo/tokenize/
tokenized_docs[2][:10]

# Vectorizing "documents" as a *Bag of Words*
# $ 
\begin{array}{|c|c|c|c|c|}
\hline
& \text{token 1} &  \text{token 2} & \cdots &\text{token p} \\ \hline 
\text{document 1} &&&& \\ \hline
\text{document 2} &&&& \\ \hline
\vdots &&&& \\ \hline
\text{document n} &&&& \\ \hline
\end{array}
$

# How big is $n$?


# How big is $p$?

# Addressing $p$:
# - stop words 
# - punctuation/whitespace
# - stemming/lemmatizing

In [None]:
from nltk.corpus import stopwords
#nltk.download('stopwords')
# English stop words held in a set so membership tests are cheap when filtering tokens.
sw = set(stopwords.words('english'))

In [None]:
sw

In [None]:
# Drop every token found in the stop-word set; document and token order are kept.
toc_doc = [
    [word for word in tokens if word not in sw]
    for tokens in tokenized_docs
]

tokenized_docs = toc_doc

In [None]:
import string

# Punctuation characters plus the two quote tokens `` and '' that are added explicitly.
sp = set(string.punctuation)
sp.update(['``', "''"])

# Drop tokens that are exactly one of the punctuation marks above.
toc_doc = [
    [word for word in tokens if word not in sp]
    for tokens in tokenized_docs
]

tokenized_docs = toc_doc

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
#nltk.download('wordnet')

# Three alternative normalisers, compared side by side in the following cells.
porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()

In [None]:
import copy

# Three normalised copies of the corpus with the same nested-list shape as
# tokenized_docs; tokenized_docs itself is left unmodified.
tokenized_docs_porter = [
    [porter.stem(word) for word in tokens] for tokens in tokenized_docs
]
tokenized_docs_snowball = [
    [snowball.stem(word) for word in tokens] for tokens in tokenized_docs
]
tokenized_docs_lemmatize = [
    [wordnet.lemmatize(word) for word in tokens] for tokens in tokenized_docs
]

In [None]:
tokenized_docs[2]

In [None]:
documents[2]

In [None]:
tokenized_docs[2][:10]

In [None]:
tokenized_docs_porter[2][:10]

In [None]:
tokenized_docs_snowball[2][:10]

In [None]:
tokenized_docs_lemmatize[2][:10]

In [None]:
# Unique Porter stems across the corpus.  Sorted so that the column order of
# the term-frequency matrix is the same on every run (set iteration order is
# not guaranteed to be stable between kernels).
vocab = sorted(set(t for d in tokenized_docs_porter for t in d))

In [None]:
# Re-bind `vocab` from a list of stems to a {stem: position in that list} lookup.
vocab = {v:i for i,v in enumerate(vocab)}

In [None]:
import numpy as np
# Zero-initialised count matrix: one row per document, one column per vocabulary entry.
TF = np.zeros([len(tokenized_docs_porter), len(vocab)])

In [None]:
# Count how often each stem occurs in each document.
for row, stems in enumerate(tokenized_docs_porter):
    for stem in stems:
        TF[row, vocab[stem]] += 1

# Vocabulary, Corpus
# Term Frequency, Presence/Absence, Document Frequency
# $ 
\begin{array}{|c|c|c|c|c|}
\hline
& \text{token 1} &  \text{token 2} & \cdots &\text{token p} \\ \hline 
\text{document 1} &&&& \\ \hline
\text{document 2} &&&& \\ \hline
\vdots &&&& \\ \hline
\text{document n} &&&& \\ \hline
\end{array}
$

<br>
<font color="red">
# What makes two documents similar?
<br>

<img src="stuff/Image530.gif" width="500px" align="left"> <!-- alternate image: Image529.gif -->



# Inverse Document Frequency Weighting

# $$TF\text{-}IDF_{ij} = TF_{ij} \cdot \left(1 + \log\frac{n}{DF_{j}}\right) $$



# Normalizing

# $$^*TF\text{-}IDF_{ij} = \frac{TF\text{-}IDF_{ij}}{\sum_{j=1}^p TF\text{-}IDF_{ij}}$$

# $$^{**}TF\text{-}IDF_{ij} = \frac{TF\text{-}IDF_{ij}}{\sqrt{\sum_{j=1}^p TF\text{-}IDF_{ij}^2}}$$




In [None]:
# Document frequency: for each column of TF, the number of rows (documents) with a non-zero count.
DF = (TF>0).sum(axis=0)#/len(TF)
DF.shape
# plt.hist(DF)
# DF.min()

In [None]:
# Weight each count by 1 + ln(n / DF), where n = len(TF) is the number of documents.
# Adding 0. to DF makes the division floating-point (it would be integer division under Python 2).
TF_IDF = TF*(1+np.log(len(TF)/(0.+DF)))

In [None]:
np.log(999./7)

In [None]:
# Scale every document row to unit Euclidean length so that dot products
# between rows are cosine similarities.
row_norms = np.sqrt(np.sum(TF_IDF**2, axis=1))
# A document with no surviving tokens has norm 0; dividing by 1 instead keeps
# its row all-zero rather than turning it into NaNs.
row_norms[row_norms == 0] = 1.0
TF_IDF /= row_norms[:, np.newaxis]

# The dot product of two vectors $X_1$ and $X_2$ depends on their magnitude and the angle $\theta$ between them:
# $$ \frac{X_1 \cdot X_2}{||X_1|| \; ||X_2||} = Cos(\theta)$$

# *Cosine Similarity* measures if two vectors point in similar directions:
# The angle between two vectors is $\quad\quad\quad$ if they point in similar directions.
# The angle between two vectors is $\quad\quad\quad$ if they point in dissimilar directions.
# The cosine similarity between two vectors is $\quad\quad\quad$ if they point in similar directions.
# The cosine similarity between two vectors is $\quad\quad\quad$ if they point in dissimilar directions.
# The maximum angle between *vectorized documents is* $\quad\quad\quad$.



In [None]:
# TF_IDF.dot(TF_IDF.T) holds every pairwise row dot product; row 2 compares document 2 with all documents.
# argsort is ascending, so the indices with the largest products come last.
TF_IDF.dot(TF_IDF.T)[2].argsort()

In [None]:
documents[782]

# sklearn

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Count matrix built with sklearn's own tokenizer and its built-in English stop-word list.
vect = CountVectorizer(stop_words='english')
word_counts = vect.fit_transform(documents)

In [None]:
# Built once rather than on every call: tokenize() runs once per document, and
# these sets never change between calls.
_TOKENIZE_STOP_WORDS = set(stopwords.words('english'))
_TOKENIZE_PUNCTUATION = set(string.punctuation) | {'``', "''"}


def tokenize(doc):
    '''
    INPUT: string
    OUTPUT: list of strings

    Tokenize the document with nltk.word_tokenize, drop English stop words
    and punctuation tokens (including the `` and '' quote tokens), then
    Porter-stem the tokens that remain.  Token order is preserved.
    '''
    return [
        porter.stem(token)
        for token in nltk.word_tokenize(doc)
        if token not in _TOKENIZE_STOP_WORDS
        and token not in _TOKENIZE_PUNCTUATION
    ]

vect = CountVectorizer(tokenizer=tokenize)
word_counts = vect.fit_transform(documents)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
j=1
# x: the hand-built TF counts for document j, looked up in sklearn's feature order;
# y: sklearn's counts for the same document.
plt.plot([TF[j][vocab[i]] for i in vect.get_feature_names()],word_counts.toarray()[j],'.') 

In [None]:
# Re-binds `vect` and `word_counts` to a TF-IDF vectorizer and its output, using the custom tokenizer.
# NOTE(review): smooth_idf=False is presumably chosen to match the hand-computed 1 + ln(n/DF) weighting — confirm.
vect = TfidfVectorizer(tokenizer=tokenize, smooth_idf=False)
word_counts = vect.fit_transform(documents)

In [None]:
j=1
# x: hand-computed TF_IDF values for document j in sklearn's feature order; y: sklearn's values.
plt.plot([TF_IDF[j][vocab[i]] for i in vect.get_feature_names()],word_counts.toarray()[j],'.') 
# Reference line y = x from 0 to 0.5.
plt.plot([0,.5],[0,.5])

# spacy

In [None]:
# conda install spacy
# pip install spacy && python -m spacy.en.download
import spacy

In [None]:
nlp = spacy.load('en')

In [None]:
parsed_doc = nlp(documents[2])
parsed_doc

In [None]:
# Print each sentence of the parsed document preceded by its position.
# Single-argument print(...) behaves the same under Python 2 and Python 3.
for i, s in enumerate(parsed_doc.sents):
    print(i)
    print(s)

In [None]:
# Print each entity with its position and label.  A single formatted string
# reproduces the space-separated output of the Python 2 print statement and is
# also valid Python 3.
for i, e in enumerate(parsed_doc.ents):
    print(u"%s %s : ( %s )" % (i, e, e.label_))

In [None]:
import pandas as pd

# Token text alongside its part-of-speech tag.
text = [token.orth_ for token in parsed_doc]
post = [token.pos_ for token in parsed_doc]

# list(...) so the constructor receives a concrete list of rows on Python 3,
# where zip returns an iterator; it is a no-op copy on Python 2.
pd.DataFrame(list(zip(text, post)))

In [None]:
# Token text alongside its entity type and IOB entity tag.
ent_type = [token.ent_type_ for token in parsed_doc]
ent_iob = [token.ent_iob_ for token in parsed_doc]

# list(...) so the constructor receives a concrete list of rows on Python 3,
# where zip returns an iterator; it is a no-op copy on Python 2.
pd.DataFrame(list(zip(text, ent_type, ent_iob)))

In [None]:
# Token text alongside its lemma and shape string.
lemma = [token.lemma_ for token in parsed_doc]
shape = [token.shape_ for token in parsed_doc]

# list(...) so the constructor receives a concrete list of rows on Python 3,
# where zip returns an iterator; it is a no-op copy on Python 2.
pd.DataFrame(list(zip(text, lemma, shape)))

In [None]:
# One row of lexical attributes per token of the parsed document.
# (The `ent_iob` list that used to be rebuilt here was never used by this cell.)
stuffs = [
    (token.orth_, token.prob, token.is_stop, token.is_punct,
     token.is_space, token.like_num, token.is_oov)
    for token in parsed_doc
]

pd.DataFrame(stuffs, columns=["orth","log prob","is_stop","is_punct","is_space","like_num","is_oov"])

# gensim 
## Phrase Modeling

In [None]:
# Phrase Modeling e.g., "Happy Hour"

In [None]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

In [None]:
#is_stop
def norm_lem():
    """Yield every sentence of every document as one space-joined string of lemmas.

    Each document is parsed with the spaCy pipeline `nlp`; punctuation and
    whitespace tokens are left out of the joined string.
    """
    for raw_text in documents:
        for sentence in nlp(raw_text).sents:
            kept_lemmas = []
            for tok in sentence:
                if tok.is_punct or tok.is_space:
                    continue
                kept_lemmas.append(tok.lemma_)
            yield u' '.join(kept_lemmas)

In [None]:
import codecs

In [None]:
# Write the lemmatised sentences to disk as UTF-8, one sentence per line.
with codecs.open("LineSentenceFile.txt", 'w', encoding='utf_8') as f:
    for s in norm_lem():
        f.write(s + "\n")

In [None]:
import itertools as it
# Show sentences 230-232 of the sentence file.  Parenthesised print works
# identically under Python 2 and Python 3.
for s in it.islice(LineSentence("LineSentenceFile.txt"),230,233):
    print(u' '.join(s))

In [None]:
bigram_model = Phrases(LineSentence("LineSentenceFile.txt"))

In [None]:
bigram_model.save("BigramModel.txt")
# Re-write every sentence with the bigram model applied, one sentence per line (UTF-8).
with codecs.open("BigramSentenceFile.txt", 'w', encoding='utf_8') as f:
    for s in LineSentence("LineSentenceFile.txt"):
        f.write(u' '.join(bigram_model[s]) + "\n")

In [None]:
trigram_model = Phrases(LineSentence("BigramSentenceFile.txt"))

In [None]:
trigram_model.save("TrigramModel.txt")
# Apply the second-pass phrase model to the bigram sentences, one sentence per line (UTF-8).
with codecs.open("TrigramSentenceFile.txt", 'w', encoding='utf_8') as f:
    for s in LineSentence("BigramSentenceFile.txt"):
        f.write(u' '.join(trigram_model[s]) + "\n")

In [None]:
import itertools as it
# Show sentences 139-145 of the trigram sentence file.  Parenthesised print
# works identically under Python 2 and Python 3.
for s in it.islice(LineSentence("TrigramSentenceFile.txt"),139,146):
    print(u' '.join(s))

In [None]:
# Whole-document version of the phrase pipeline, written one document per line (UTF-8).
with codecs.open("Trigram_documents.txt", 'w', encoding='utf_8') as f:
    for doc in [d for d in documents if len(d) > 0]:
        parsed_doc = nlp(doc)
        # Lemmas of all non-punctuation, non-whitespace tokens, passed through the bigram then the trigram model.
        tri = trigram_model[bigram_model[[token.lemma_ for token in parsed_doc if not token.is_punct and not token.is_space]]]         
        # Stop words are removed only after phrase detection.
        f.write(u' '.join([t for t in tri if t not in spacy.en.STOPWORDS])+'\n')

# gensim 
## Topic Modeling

In [None]:
# topic modeling
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

#pip install pyldavis
import pyLDAvis
import pyLDAvis.gensim
import warnings
#from cPickle import pickle

In [None]:
# Build the token <-> id mapping from the phrase-modelled documents, then prune it.
TrigramDictionary = Dictionary(LineSentence("Trigram_documents.txt"))
# Per gensim's Dictionary API: keep tokens found in at least 10 documents and in at most 40% of them.
TrigramDictionary.filter_extremes(no_below=10, no_above=.4)
# Additionally drop the 300 most frequent remaining tokens.
TrigramDictionary.filter_n_most_frequent(300)
TrigramDictionary.compactify()
#TrigramDictionary.save("TrigramDictionary.txt")
#TrigramDictionary = Dictionary.load("TrigramDictionary.txt")

In [None]:
def bow():
    """Yield the bag-of-words encoding of each line of Trigram_documents.txt.

    Every line is converted with TrigramDictionary.doc2bow, in file order.
    """
    trigram_lines = LineSentence("Trigram_documents.txt")
    for tokens in trigram_lines:
        yield TrigramDictionary.doc2bow(tokens)

In [None]:
MmCorpus.serialize("Trigram_documents_bow.txt", bow())

In [None]:
lda = LdaMulticore(MmCorpus("Trigram_documents_bow.txt"), num_topics=6, workers=1, id2word=TrigramDictionary)

In [None]:
# explore topics
for term, freq in lda.show_topic(0,10):
    print term, ":", round(freq, 3)

In [None]:
lda_vis = pyLDAvis.gensim.prepare(lda, MmCorpus("Trigram_documents_bow.txt"), TrigramDictionary)

In [None]:
pyLDAvis.display(lda_vis)

In [None]:
# Topic mixture, phrase-modelled text and raw text of document d.
# Parenthesised print works identically under Python 2 and Python 3.
d = 2
print(lda[list(bow())[d]])
print(u' '.join(list(LineSentence("Trigram_documents.txt"))[d]))
documents[d]

# gensim 
## Word2Vec

In [None]:
from gensim.models import Word2Vec

In [None]:
# Skip-gram (sg=1) model with 20-dimensional vectors, a context window of 5 and a fixed seed.
myW2V = Word2Vec(LineSentence("Trigram_documents.txt"), size=20, window=5, min_count=3, sg=1, seed=0)
# Five further passes over the same corpus.
# NOTE(review): newer gensim releases require total_examples/epochs arguments to train() — confirm against the installed version.
for i in range(5):
    myW2V.train(LineSentence("Trigram_documents.txt"))

myW2V.init_sims()
myW2V.train_count

In [None]:
# The 100 most frequent vocabulary terms with their corpus counts.
# .items() works on both Python 2 and Python 3 (iteritems is Python-2-only).
sorted([(t, v.count) for t, v in myW2V.vocab.items()], key = lambda x: -x[1])[:100]

In [None]:
# 1:16:42
# vectors

In [None]:
myW2V.most_similar(positive=['child'], topn=5)

In [None]:
myW2V.most_similar(positive=[u'country'], topn=5)

In [None]:
# word algebra

In [None]:
myW2V.most_similar(positive=[u'united_states',u'middle_east'], negative=[u'democracy'], topn=1)

In [None]:
myW2V.most_similar(positive=[u'united_states'], negative=[u'food'], topn=1)

In [None]:
myW2V.most_similar(positive=[u'food',u'russia'], topn=1)

In [None]:
myW2V.most_similar(positive=[u'united_states',u'election'], topn=1)

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Row i of syn0 is the vector of the term whose vocab entry has .index == i.
# Iterating the vocab dict yields terms in arbitrary order, which would label
# the rows with the wrong words, so order the terms by that index first.
terms_by_row = sorted(myW2V.vocab, key=lambda term: myW2V.vocab[term].index)
ins = pd.DataFrame(myW2V.syn0, index=terms_by_row).drop(spacy.en.STOPWORDS, errors='ignore')
# Only the first 1000 remaining terms are embedded, to keep t-SNE fast.
ins = ins[:1000]
# Fixed random_state so the 2-D layout is reproducible between runs.
tsne = TSNE(random_state=0)
tsne_vects = tsne.fit_transform(ins.values)
outs = pd.DataFrame(tsne_vects, index=pd.Index(ins.index), columns=['tsne_x','tsne_y'])
# Duplicate the index as a column so the hover tooltip can refer to it by name.
outs['word'] = outs.index

In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value
output_notebook()

In [None]:
# Interactive scatter plot of the 2-D t-SNE coordinates; hovering a point shows its word.
plot_data = ColumnDataSource(outs)
tsne_plot = figure(title="tsne word encodings",
                   plot_width=800, plot_height=800,
                   tools=('pan, wheel_zoom, box_zoom, box_select, resize, reset'),
                   active_scroll='wheel_zoom')

# '@word' pulls the tooltip text from the 'word' column of the data source.
tsne_plot.add_tools( HoverTool(tooltips = '@word') )
tsne_plot.circle('tsne_x','tsne_y', source=plot_data, color='blue', line_alpha=.2, fill_alpha=.1,
                 size=10, hover_line_color='black')

# Hide axes, grid and outline.
tsne_plot.title.text_font_size = value('16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None
show(tsne_plot)

<img src="stuff/bayes.tiff" width="1000px" align="left">

# OBJECTIVES:
# 1. NB4NLP (Naive Bayes for Natural Language Processing)
<br>
# Bonus:

# 1. Generative Modeling
# 2. Why NB4NLP?
# $\quad$ a. p>>n covariance matrix estimation
# $\quad$ b. Computation




# NB4NLP

# $ 
\begin{array}{|c|c|c|c|c|c|}
\hline
& \text{token 1} &  \text{token 2} & \cdots &\text{token p}& \textbf{Class Label } (Y) \\ \hline 
\text{document 1} &&&& &\\ \hline
\text{document 2} &&&& &\\ \hline
\vdots &&&& &\\ \hline
\text{document n} &&&& &\\ \hline
\end{array}
$


# $$ 
\begin{align*}
\textbf{X}_0 &= \text{"$\textbf{X}_0$ is a 'document' that..."} \\
&= (t_1, t_2, \cdots, t_K)\\
&\\
Pr(Y_0=y_0|\textbf{X}_0) &= \frac{Pr(\textbf{X}_0|Y_0=y_0)Pr(Y_0=y_0)}{Pr(\textbf{X}_0)}\\
&\propto  Pr(\textbf{X}_0|Y_0=y_0)Pr(Y_0=y_0)\\
{}&\\
Pr(Y_0=y_0) &= \sum_{i: Y_i=y_0}\frac{1}{n}\\
Pr(\textbf{X}_0|Y_0=y_0) &= \prod_{k=1}^{|\textbf{X}_0|}Pr(X_{0k}|Y_0=y_0) \\
&= \prod_{k=1}^{|\textbf{X}_0|} \left( \sum_{ \underset{\text{token}_{j}=X_{0k}}{\overset{i,j: Y_i=y_0}{}} }TF_{ij} \right)/\left( \sum_{ \underset{\;}{\overset{i,j: Y_i=y_0}{}} }TF_{ij} \right)
\end{align*}
$$
# What is the assumption here?

# What if $TF_{ij}$ = 0?
# Laplace Smoothing: 
# $$ \prod_{k=1}^{|\textbf{X}_0|} \left( \sum_{ \underset{\text{token}_{j}=X_{0k}}{\overset{i,j: Y_i=y_0}{}} }(TF_{ij}+\alpha) \right)/\left( \sum_{ \underset{\;}{\overset{i,j: Y_i=y_0}{}} }(TF_{ij} +\alpha) \right) $$

# How do we get rid of the *"proportional to"*?
# $$
\begin{align*}
Pr(Y_0=y_0|\textbf{X}_0) &= \frac{Pr(\textbf{X}_0|Y_0=y_0)Pr(Y_0=y_0)}{\underset{y}{\sum} Pr(\textbf{X}_0|Y_0=y)Pr(Y_0=y)}\\
&= \frac{\left(\prod_{k=1}^{|\textbf{X}_0|} p_k^{y_0}\right)Pr(Y_0=y_0)}{\underset{y}{\sum} \left(\prod_{k=1}^{|\textbf{X}_0|} p_k^y\right)Pr(Y_0=y)}
\end{align*}$$


# What about multiplying so many small probabilities together?
# $$ \log\left(\prod_{k=1}^{|\textbf{X}_0|} p_k\right) = \sum_{k=1}^{|\textbf{X}_0|}  \log p_k $$

# $$
\begin{align*}
Pr(Y_0=y_0|\textbf{X}_0) &= \frac{\exp\left(\sum_{k=1}^{|\textbf{X}_0|}  \log p_k^{y_0} \right) Pr(Y_0=y_0)}{\underset{y}{\sum} \exp\left(\sum_{k=1}^{|\textbf{X}_0|}  \log p_k^{y} \right) Pr(Y_0=y)}
\end{align*}$$

# Okay but this is still a really small number that we probably can't represent...
# $$ Pr(\textbf{X}_0|Y_0=y) = \exp\left(\sum_{k=1}^{|\textbf{X}_0|}  \log p_k^y \right) $$

# But we can scale it!
# $$
\begin{align*}
Pr(Y_0=y_0|\textbf{X}_0) &= \frac{\exp(C) \exp\left(\sum_{k=1}^{|\textbf{X}_0|}  \log p_k^{y_0} \right) Pr(Y_0=y_0)}{\underset{y}{\sum}\exp(C) \exp\left(\sum_{k=1}^{|\textbf{X}_0|}  \log p_k^{y} \right) Pr(Y_0=y)}\\
&= \frac{\exp\left(C+\sum_{k=1}^{|\textbf{X}_0|}  \log p_k^{y_0} \right) Pr(Y_0=y_0)}{\underset{y}{\sum}  \exp\left(C+ \sum_{k=1}^{|\textbf{X}_0|}  \log p_k^{y} \right) Pr(Y_0=y)}
\end{align*}$$

# Discriminative or Predictive (Conditional) Model:
# $$ f(Y_0|\textbf{X}_0)$$

# Generative (Joint) Model
# $$ f(\textbf{X}_i,Y_i) \quad \sim \quad f(\textbf{X}_i|Y_i) $$



# NB4NLP
# $$ 
\begin{align*}
Pr(\textbf{X}_i^*|Y_i=y) &= Multinomial(\hat \pi_y)\\
 &= f_{\hat \pi_y}(\textbf{X}_i^*)\\
Pr(Y_i=y) &= \hat w_y\\
Pr(\textbf{X}_i^*,Y_i=y) &= \hat w_y f_{\hat \pi_y}(\textbf{X}_i^*) \\
Pr(\textbf{X}_i^*) &= \sum_y \hat w_y f_{\hat \pi_y}(\textbf{X}_i^*) \\
\end{align*}$$

# $$ 
\begin{align*}
\textbf{X}_0 &= \text{"$\textbf{X}_0$ is a 'document' that..."} \\
&= (t_1, t_2, \cdots, t_K)\\
\textbf{X}_0^* &= (TF_{01}^*, TF_{02}^*, \cdots, TF_{0p}^*)\\
{}\\
\textbf{X}_i &= (X_{i1}, X_{i2}, \cdots, X_{ip})\\
\end{align*}$$

# NB4continuousfeatures
# $$ 
\begin{align*}
Pr(\textbf{X}|Y=y) &= MVN(\hat \mu_y, \hat \Sigma_y)\\
 &= f_{\hat \mu_y, \hat \Sigma_y}(\textbf{X})\\
Pr(Y=y) &= \hat w_y\\
Pr(\textbf{X},Y=y) &= \hat w_y f_{\hat \mu_y, \hat \Sigma_y}(\textbf{X})\\
Pr(\textbf{X}) &= \sum_y \hat w_y f_{\hat \mu_y, \hat \Sigma_y}(\textbf{X})
\end{align*}$$


<img src="stuff/mixture.png" width="500px" align="center">


# $$
\left[\begin{array}{cccc}
X_{11}&X_{12} & \cdots & X_{1p} \\
X_{21}&X_{22} & \cdots & X_{2p} \\
\vdots &\vdots &\ddots&\vdots \\\hline
X_{i1}&X_{i2} & \cdots & X_{ip} \\ \hline
\vdots &\vdots &\ddots&\vdots \\
X_{n1}&X_{n2} & \cdots & X_{np} \\
\end{array}\right]
$$

# $$ {\boldsymbol X}_i = (X_{i1}, X_{i2}, \cdots, X_{ip})^T \sim MVN({\boldsymbol \mu}_p,\Sigma_{p \times p})$$ 


# $${\boldsymbol X}_i = \left(\begin{array}{c} {X_{i1}} \\ {X_{i2}} \\ \vdots\\ {X_{ip}} \end{array} \right) \sim MVN\left(
 \left[\begin{array}{c} \mu_{{X_{i1}}} \\ \mu_{{X_{i2}}} \\ \vdots\\ \mu_{{X_{ip}}} \end{array} \right]_,
\left[\begin{array}{cccc}\sigma^2_{{X_{i1}}}&\sigma_{{X_{i1}X_{i2}}} & \cdots & \sigma_{{X_{i1}X_{ip}}}\\ 
\sigma_{{X_{i2}X_{i1}}} &\sigma^2_{{X_{i2}}}&  \cdots & \sigma_{{X_{i2}X_{ip}}}\\
\vdots &\vdots&  \ddots & \vdots\\
\sigma_{{X_{ip}X_{i1}}} & \sigma_{{X_{ip}X_{i2}}} &   \cdots & \sigma^2_{{X_{ip}}}\\ \end{array}\right]\right)$$

# Can't estimate the covariance matrix above when $p>n$... so instead we'll use:

# $${\boldsymbol X}_i = \left(\begin{array}{c} {X_{i1}} \\ {X_{i2}} \\ \vdots\\ {X_{ip}} \end{array} \right) \sim MVN\left(
 \left[\begin{array}{c} \mu_{{X_{i1}}} \\ \mu_{{X_{i2}}} \\ \vdots\\ \mu_{{X_{ip}}} \end{array} \right]_,
\left[\begin{array}{cccc}\sigma^2_{{X_1}}&\;\;\;0\;\;\;& \cdots & \;\;\;0\;\;\;\\ 
0&\sigma^2_{{X_2}}&  \cdots & 0\\
\vdots &\vdots&  \ddots & \vdots\\
\;\;\;\;0\;\;\;\; & 0 &   \cdots & \sigma^2_{{X_p}}  \\ \end{array}\right]\right)$$



<img src="stuff/nb.tiff" width="1000px" align="center">

<table>
<tr>
<td><img src="stuff/mix2b.png" width="600px" align="center"></td>
<td><img src="stuff/mix2.png" width="600px" align="center"></td>
</tr>
</table>

# Highly correlated features will ruin classification
# Probability estimates are not particularly reliable
# Less naive methodologies will outperform NB
# But NB is FAST and can give acceptable results
# Especially in HUGE data sets where it's the only way

In [None]:
# Austin: NLP
# 
# onsite info collected

# Noah: Product Hunt