# Text Analysis (NLP)

Taken from [http://openonlinecourses.com/causalanalysis/TextAnalysis.asp](http://openonlinecourses.com/causalanalysis/TextAnalysis.asp).

## Crawl the directory of data

In [1]:
import pathlib

# Sort the matches: directory listings come back in arbitrary, platform-dependent
# order, and the ids used by the maps below are derived from each file's position
# in this list. Sorting keeps those ids stable across machines and re-runs.
csv_files = sorted(pathlib.Path('./CSV.Sentiment').glob('*.csv'))
len(csv_files)

25

## Build maps

These maps will help us map back and forth between the files and data.

In [2]:
import pandas as pd

def clean_file_name(file_path):
    """Turn a file path into a normalized sentence string.

    Takes the path's stem (the file name without its final suffix), replaces
    each '+' with a space, deletes every '_' and '.', lower-cases the result
    and strips surrounding whitespace.
    """
    text = file_path.stem.replace('+', ' ')
    for unwanted in ('_', '.'):
        text = text.replace(unwanted, '')
    return text.lower().strip()

# file path (as a string) -> integer id, assigned by position in csv_files
f2i = {str(path): idx for idx, path in enumerate(csv_files)}

# integer id -> file path string (the inverse of f2i)
i2f = {idx: name for name, idx in f2i.items()}

# integer id -> sentence recovered from the cleaned file name
f2s = {f2i[str(path)]: clean_file_name(path) for path in csv_files}

# integer id -> DataFrame with the lower-cased 'comment' text and its 'classification'
f2d = {
    f2i[str(path)]: (
        pd.read_csv(path)
        .loc[:, ['comment', 'classification']]
        .assign(comment=lambda frame: frame['comment'].str.lower())
    )
    for path in csv_files
}

## Vectorization

Let's create vector space models (VSMs) for each of these corpora of documents.

In [21]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def get_vsm(df, vectorizer_type='count', max_features=100):
    """Build a bag-of-words vector space model from a frame of comments.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'comment' column (the text to vectorize) and a
        'classification' column (the label for each comment).
    vectorizer_type : str, default 'count'
        'count' selects ``CountVectorizer``; any other value selects
        ``TfidfVectorizer``.
    max_features : int, default 100
        Vocabulary size cap passed to the vectorizer.

    Returns
    -------
    dict
        'data': DataFrame with one column per vocabulary term plus a '__y'
        column holding the labels; 'vectorizer': the fitted vectorizer.
    """
    text = df['comment']

    if 'count' == vectorizer_type:
        vectorizer = CountVectorizer(max_features=max_features)
    else:
        vectorizer = TfidfVectorizer(max_features=max_features)

    # toarray() gives a plain ndarray; todense() would give the legacy np.matrix.
    X = vectorizer.fit_transform(text).toarray()

    count_df = pd.DataFrame(X, columns=vectorizer.get_feature_names_out())
    # Assign the labels by position. count_df carries a fresh 0..n-1 index, so
    # assigning the Series itself would align on df's index and produce NaN or
    # misplaced labels whenever df is not indexed 0..n-1 (e.g. after filtering).
    count_df['__y'] = df['classification'].to_numpy()

    return {'data': count_df, 'vectorizer': vectorizer}

# One VSM per file id for each vectorizer flavour: raw term counts and TF-IDF weights.
count_vsm = {fid: get_vsm(frame, vectorizer_type='count') for fid, frame in f2d.items()}
tfidf_vsm = {fid: get_vsm(frame, vectorizer_type='tfidf') for fid, frame in f2d.items()}

## Learn models

Let's learn a classification model (e.g. logistic regression) for each of the VSM types.

In [22]:
import numpy as np
from sklearn.linear_model import LogisticRegression

def get_model(df):
    """Fit a logistic-regression classifier on a vectorized frame.

    Every column of ``df`` except '__y' is used as a feature and '__y'
    supplies the target. Returns the fitted ``LogisticRegression`` instance.
    """
    features = df.drop(columns='__y')
    target = df['__y'].to_numpy().ravel()

    classifier = LogisticRegression(
        solver='saga',
        max_iter=5_000,
        n_jobs=-1,
        random_state=37,
    )
    classifier.fit(features, target)
    return classifier

# Train one classifier per file id on each of the two vectorized frames.
count_models = {fid: get_model(vsm['data']) for fid, vsm in count_vsm.items()}
tfidf_models = {fid: get_model(vsm['data']) for fid, vsm in tfidf_vsm.items()}

In [32]:
def do_predict(fid):
    """Score the sentence attached to file id ``fid`` with both of its models.

    Looks up the sentence in ``f2s``, vectorizes it with the count and TF-IDF
    vectorizers stored for that id, and prints the class probabilities
    returned by the matching entries of ``count_models`` and ``tfidf_models``.
    Nothing is returned; all results are printed.
    """
    s = f2s[fid]
    print(s)

    count_v = count_vsm[fid]['vectorizer']
    tfidf_v = tfidf_vsm[fid]['vectorizer']

    # toarray() yields a plain ndarray; todense() yields np.matrix, which recent
    # scikit-learn releases reject as estimator input. The columns are labelled
    # with the vectorizer's vocabulary so the input carries the same feature
    # names as the frame the model was trained on.
    count_s = pd.DataFrame(
        count_v.transform([s]).toarray(),
        columns=count_v.get_feature_names_out(),
    )
    tfidf_s = pd.DataFrame(
        tfidf_v.transform([s]).toarray(),
        columns=tfidf_v.get_feature_names_out(),
    )

    count_m = count_models[fid]
    tfidf_m = tfidf_models[fid]

    # Columns of the stored training frame (vocabulary terms plus the '__y' label).
    count_c = count_vsm[fid]['data'].columns
    print(count_c)

    count_p = count_m.predict_proba(count_s)
    tfidf_p = tfidf_m.predict_proba(tfidf_s)

    print(s, count_p, tfidf_p)
    
# Try the prediction helper on the file that was assigned id 0.
do_predict(fid=0)

a very uneventful experience when it came down to pain
Index(['about', 'after', 'all', 'always', 'am', 'amazing', 'an', 'and', 'any',
       'are',
       ...
       'went', 'were', 'what', 'when', 'with', 'wonderful', 'would', 'years',
       'you', '__y'],
      dtype='object', length=101)
a very uneventful experience when it came down to pain [[0.84691516 0.15308484]] [[0.65470883 0.34529117]]


