In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_predict

In [6]:
import os

data_dir = './review_polarity/txt_sentoken/'

classes = ['pos', 'neg']

### Read the data

train_data = []
train_labels = []
test_data = []
test_labels = []

for curr_class in classes:
	dirname = os.path.join(data_dir, curr_class)
	for fname in os.listdir(dirname):
	    with open(os.path.join(dirname, fname), 'r') as f:
	        content = f.read()
	        if fname.startswith('cv9'):
	            test_data.append(content)
	            test_labels.append(curr_class)
	        else:
	            train_data.append(content)
	            train_labels.append(curr_class)

In [7]:
### Create feature vectors

vectorizer = TfidfVectorizer(min_df=5,
                             max_df = 0.8,
                             sublinear_tf=True,
                             use_idf=True)

In [8]:
train_vectors = vectorizer.fit_transform(train_data)
test_vectors = vectorizer.transform(test_data)

In [9]:
### Perform  a logistic regression model, and fit with X and y

reg_log = LogisticRegression()

reg_log.fit(train_vectors, train_labels).score(test_vectors, test_labels)

prediction_reg_log = reg_log.predict(test_vectors)

In [10]:
print(classification_report(test_labels, prediction_reg_log)) 

             precision    recall  f1-score   support

        neg       0.89      0.90      0.90       100
        pos       0.90      0.89      0.89       100

avg / total       0.90      0.90      0.89       200



In [11]:
print(confusion_matrix(test_labels, reg_log.predict(test_vectors)))

[[90 10]
 [11 89]]


In [12]:
predicted = cross_val_predict(reg_log, train_vectors, train_labels, cv=10)
print("Cross validation %s" % accuracy_score(train_labels, predicted))

Cross validation 0.863333333333


In [13]:
sent = "I love this ugly dog with bad apearance"

In [14]:
# vectorize the sentence

sent = vectorizer.transform([sent])

In [15]:
reg_log.predict(sent)

array(['neg'], 
      dtype='<U3')