# Text Analysis (NLP)

Taken from [http://openonlinecourses.com/causalanalysis/TextAnalysis.asp](http://openonlinecourses.com/causalanalysis/TextAnalysis.asp).

## Crawl the directory of data

In [1]:
import pathlib

csv_files = list(pathlib.Path('./CSV.Sentiment').glob('*.csv'))
len(csv_files)

25

## Build maps

These maps will help us map back and forth between the files and data.

In [2]:
import pandas as pd

def clean_file_name(file_path):
    stem = file_path.stem
    stem = stem.replace('+', ' ')
    stem = stem.replace('_', '')
    stem = stem.replace('.', '')
    stem = stem.lower()
    stem = stem.strip()
    return stem

# file-to-id
f2i = {str(p): i for i, p in enumerate(csv_files)}

# id-to-file
i2f = {v: k for k, v in f2i.items()}

# file-to-sentence
f2s = {f2i[str(p)]: clean_file_name(p) for p in csv_files}

# file-to-data
f2d = {f2i[str(p)]: pd.read_csv(p)[['comment', 'classification']].assign(comment=lambda d: d['comment'].str.lower()) 
       for p in csv_files}

## Vectorization

Let's create vector space models `VSMs` for each one of these corpus of documents.

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def get_vsm(df, vectorizer_type='count'):
    text = df['comment']
    
    if 'count' == vectorizer_type:
        vectorizer = CountVectorizer(max_features=100)
    else:
        vectorizer = TfidfVectorizer(max_features=100)
    
    vectorizer.fit(text)
    X = vectorizer.transform(text).todense()

    count_df = pd.DataFrame(X, columns=vectorizer.get_feature_names())
    count_df['__y'] = df['classification']
    
    return {'data': count_df, 'vectorizer': vectorizer}

count_vsm = {k: get_vsm(df, vectorizer_type='count') for k, df in f2d.items()}
tfidf_vsm = {k: get_vsm(df, vectorizer_type='tfidf') for k, df in f2d.items()}

## Learn models

Let's learn a classification model (e.g. Logistic Regression) for each of the VSMs types.

In [4]:
import numpy as np
from sklearn.linear_model import LogisticRegression

def get_model(df):
    X = df[[c for c in df.columns if c != '__y']]
    y = np.ravel(df['__y'])
    
    model = LogisticRegression(random_state=37, n_jobs=-1, solver='saga', max_iter=5_000)
    model.fit(X, y)
    
    return model

count_models = {k: get_model(v['data']) for k, v in count_vsm.items()}
tfidf_models = {k: get_model(v['data']) for k, v in tfidf_vsm.items()}

## Do predictions

In [5]:
def do_predict(fid):
    s = f2s[fid]
    f = i2f[fid]
    
    count_v = count_vsm[fid]['vectorizer']
    tfidf_v = tfidf_vsm[fid]['vectorizer']
    
    count_s = count_v.transform([s]).todense()
    tfidf_s = tfidf_v.transform([s]).todense()
    
    count_m = count_models[fid]
    tfidf_m = tfidf_models[fid]
    
    count_c = count_vsm[fid]['data'].columns
    tfidf_c = tfidf_vsm[fid]['data'].columns
    
    count_p = count_m.predict_proba(count_s)[0,1]
    tfidf_p = tfidf_m.predict_proba(tfidf_s)[0,1]
    
    return {
        'file': f,
        'sentence': s,
        'count_p': count_p,
        'tfidf_p': tfidf_p
    }
    
result_df = pd.DataFrame([do_predict(i) for i in range(len(count_models))])
result_df.shape

(25, 4)

In [6]:
result_df

Unnamed: 0,file,sentence,count_p,tfidf_p
0,CSV.Sentiment/Please+go+find+anyone+else+but+D...,please go find anyone else but dr haque and hi...,0.137457,0.238707
1,CSV.Sentiment/__and+then+never+looked+me+in+th...,"and then never looked me in the eye again, pre...",0.561817,0.396794
2,CSV.Sentiment/The+medicine+did+not+taste+terri...,the medicine did not taste terrible,0.61434,0.729197
3,CSV.Sentiment/Go+elsewhere+for+treatment!!__.csv,go elsewhere for treatment!!,0.547714,0.638849
4,"CSV.Sentiment/Unprofessional,+Rude,+and+a+sham...","unprofessional, rude, and a shame to the medic...",0.999406,0.998494
5,"CSV.Sentiment/Granted,+I+am+not+very+far+into+...","granted, i am not very far into healing so thi...",0.072228,0.109858
6,CSV.Sentiment/The+follow-up+care+of+Dr.+Wages+...,the follow-up care of dr wages and his staff a...,0.127893,0.177071
7,CSV.Sentiment/His+office+nurse+Arri+is+so+unpr...,his office nurse arri is so unprofessional and...,0.025998,0.04512
8,CSV.Sentiment/Great+experience-+very+friendly+...,great experience- very friendly and prompt plu...,0.000726,0.004252
9,CSV.Sentiment/I+wish+I+could+select+a+lot+more...,i wish i could select a lot more than five stars,0.288483,0.300915
