# NLP Demo Logistic Regression

In [1]:
import pandas as pd

In [2]:
filepath_dict = {
    'yelp': 'data/yelp_labelled.txt',
    'amazon': 'data/amazon_cells_labelled.txt',
    'imdb': 'data/imdb_labelled.txt'
}

df_list = []

for source, filepath in filepath_dict.items():
  df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
  df['source'] = source
  df_list.append(df)

df = pd.concat(df_list)

df.head()

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


## Logistic Regression

In [3]:
df_yelp = df[df['source'] == 'yelp']

sentences = df_yelp['sentence'].values
labels = df_yelp['label'].values

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.25, random_state=1000)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

CountVectorizer()

In [8]:
X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)

In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
train_score = classifier.score(X_train, y_train)
test_score = classifier.score(X_test, y_test)

In [11]:
print(f'train: {train_score:.4f}  test: {test_score:.4f}')

train: 0.9853  test: 0.7960


In [12]:
for source in df['source'].unique():
  df_source = df[df['source'] == source]
  sentences = df_source['sentence'].values
  labels = df_source['label'].values

  sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.25, random_state=1000)

  vectorizer = CountVectorizer()
  vectorizer.fit(sentences_train)
  X_train = vectorizer.transform(sentences_train)
  X_test = vectorizer.transform(sentences_test)

  classifier = LogisticRegression()
  classifier.fit(X_train, y_train)
  train_score = classifier.score(X_train, y_train)
  test_score = classifier.score(X_test, y_test)

  print(f'{source}: train: {train_score:.4f}  test: {test_score:.4f}')

yelp: train: 0.9853  test: 0.7960
amazon: train: 0.9920  test: 0.7960
imdb: train: 0.9929  test: 0.7487
