## Text Classification

### 0 Importing needed functions and library

In [16]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### 1 Handling data

In [11]:
filepath_dict = {'yelp':   'sentiment_labelled_sentences/yelp_labelled.txt',
                 'amazon': 'sentiment_labelled_sentences/amazon_cells_labelled.txt',
                 'imdb':   'sentiment_labelled_sentences/imdb_labelled.txt'}


df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source  # Add a new column for the source name
    df_list.append(df)

df_yelp = df_list[0]
df_amazon = df_list[1]
df_imdb= df_list[2]

### 2 Splitting Model

In [12]:
yelp_sentences = df_yelp['sentence'].values
yelp_y = df_yelp['label'].values
yelp_sentences_train, yelp_sentences_test, yelp_y_train, yelp_y_test = train_test_split(yelp_sentences, yelp_y, test_size=0.25, random_state=100)

amazon_sentences = df_amazon['sentence'].values
amazon_y = df_amazon['label'].values
amazon_sentences_train, amazon_sentences_test, amazon_y_train, amazon_y_test = train_test_split(amazon_sentences, amazon_y, test_size=0.25, random_state=100)

imdb_sentences = df_imdb['sentence'].values
imdb_y = df_imdb['label'].values
imdb_sentences_train, imdb_sentences_test, imdb_y_train, imdb_y_test = train_test_split(imdb_sentences, imdb_y, test_size=0.25, random_state=100)

### 3 Transforming the sentences to a numirical values 

In [14]:
vectorizer = CountVectorizer()
vectorizer.fit(yelp_sentences_train)

X_train_yelp = vectorizer.transform(yelp_sentences_train)
X_test_yelp  = vectorizer.transform(yelp_sentences_test)
X_train_yelp 

vectorizer = CountVectorizer()
vectorizer.fit(amazon_sentences_train)

X_train_amazon = vectorizer.transform(amazon_sentences_train)
X_test_amazon  = vectorizer.transform(amazon_sentences_test)
X_train_amazon 

vectorizer = CountVectorizer()
vectorizer.fit(imdb_sentences_train)

X_train_imdb = vectorizer.transform(imdb_sentences_train)
X_test_imdb  = vectorizer.transform(imdb_sentences_test)
X_train_imdb 

<561x2517 sparse matrix of type '<class 'numpy.int64'>'
	with 8495 stored elements in Compressed Sparse Row format>

### Create a model and calculate the accurcy 

In [17]:
classifier_yelp = LogisticRegression()
classifier_yelp.fit(X_train_yelp, yelp_y_train)
score_yelp = classifier_yelp.score(X_test_yelp, yelp_y_test)
score_yelp

0.808

In [19]:
classifier_yelp = LogisticRegression()
classifier_yelp.fit(X_train_yelp, yelp_y_train)
score_yelp = classifier_yelp.score(X_test_yelp, yelp_y_test)
score_yelp

0.808

In [18]:
classifier_amazon = LogisticRegression()
classifier_amazon.fit(X_train_amazon, amazon_y_train)
score_amazon = classifier_amazon.score(X_test_amazon, amazon_y_test)
score_amazon

0.82

In [20]:
classifier_imdb = LogisticRegression()
classifier_imdb.fit(X_train_imdb, imdb_y_train)
score_imdb = classifier_imdb.score(X_test_imdb, imdb_y_test)
score_imdb

0.7379679144385026