# Load Dataset

In [None]:
import os
# There are 1000 positive and 1000 negative reviews in this dataset, stored
# one review per file under 'movie_reviews/neg' and 'movie_reviews/pos'.
review = []      # raw text of each review, in load order
sentiments = []  # label ('neg' or 'pos') aligned index-for-index with `review`

# Negative reviews.
path = 'movie_reviews/neg/'
list_of_files_neg = os.listdir(path)
for fil_name in list_of_files_neg:
    # os.path.join adds the directory separator only when it is missing, so
    # the result is correct whether or not `path` ends with a slash.
    name = os.path.join(path, fil_name)
    with open(name, 'r') as f:
        data = f.read()
    review.append(data)
    sentiments.append('neg')

# Positive reviews. This path has no trailing slash, so plain string
# concatenation would yield 'movie_reviews/pos<file>' and fail to open.
path = 'movie_reviews/pos'
list_of_files_pos = os.listdir(path)
for fil_name in list_of_files_pos:
    name = os.path.join(path, fil_name)
    with open(name, 'r') as f:
        data = f.read()
    review.append(data)
    sentiments.append('pos')

# Preprocessing - Choose which steps to apply and in what order

### Remove punctuation (period, comma, semicolon, etc.) and digits, and convert everything to lower case

In [None]:
import re

# Raw strings are used so that regex escapes such as "\." and "\d" reach the
# regex engine untouched instead of being parsed as (invalid) Python string
# escapes, which triggers a warning on current Python versions.

# Deleted outright: common punctuation, quotes, brackets, runs of digits,
# newlines and underscores.
REPLACE_WITH_NO_SPACE = re.compile(
    r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)|\n|_"
)
# Replaced by one space: a doubled HTML line break, hyphens and slashes.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "


def remove_symbols(reviews):
    """Lower-case every review and strip punctuation, digits and markup.

    Each string is lower-cased, then everything matched by
    ``REPLACE_WITH_NO_SPACE`` is deleted, and finally everything matched by
    ``REPLACE_WITH_SPACE`` is replaced by a single space. Note that newlines
    are deleted rather than replaced, so words on either side of a line break
    are joined unless other whitespace separates them.

    Args:
        reviews: iterable of review strings.

    Returns:
        A new list with one cleaned string per input review, in input order.
    """
    return [
        REPLACE_WITH_SPACE.sub(
            SPACE, REPLACE_WITH_NO_SPACE.sub(NO_SPACE, line.lower())
        )
        for line in reviews
    ]

### Remove Stopwords

In [None]:
import nltk
#nltk.download('stopwords') #Download using this line if it has not been downloaded earlier
from nltk.corpus import stopwords

english_stop_words = stopwords.words('english')
def remove_stop_words(corpus):
    """Drop English stop words from every review in *corpus*.

    Each review is split on whitespace, words found in the module-level
    ``english_stop_words`` collection are discarded, and the remaining words
    are re-joined with single spaces. Matching is exact and case-sensitive.

    Args:
        corpus: iterable of review strings.

    Returns:
        A new list with one filtered string per input review, in input order.
    """
    # Build the lookup set once per call: testing membership against the
    # original list would scan it linearly for every single word.
    stop_words = set(english_stop_words)
    return [
        ' '.join(word for word in text.split() if word not in stop_words)
        for text in corpus
    ]

### Stemming

In [None]:
def get_stemmed_text(corpus):
    """Return *corpus* with every whitespace-separated word Porter-stemmed.

    Args:
        corpus: iterable of review strings.

    Returns:
        A new list with one string per review; its words are the stemmed
        forms of the original words, joined by single spaces.
    """
    from nltk.stem.porter import PorterStemmer

    stem = PorterStemmer().stem
    stemmed_reviews = []
    for text in corpus:
        stemmed_words = map(stem, text.split())
        stemmed_reviews.append(' '.join(stemmed_words))
    return stemmed_reviews

### Lemmatization

In [None]:
#nltk.download('wordnet') #To download wordnet
def get_lemmatized_text(corpus):
    """Return *corpus* with every whitespace-separated word lemmatized.

    Uses NLTK's ``WordNetLemmatizer`` with its default settings.

    Args:
        corpus: iterable of review strings.

    Returns:
        A new list with one string per review; its words are the lemmatized
        forms of the original words, joined by single spaces.
    """
    from nltk.stem import WordNetLemmatizer

    lemmatize = WordNetLemmatizer().lemmatize
    lemmatized_reviews = []
    for text in corpus:
        lemmas = map(lemmatize, text.split())
        lemmatized_reviews.append(' '.join(lemmas))
    return lemmatized_reviews

### Try different combinations of the steps above to get optimal results

In [None]:
# Run the cleaning steps one after another, feeding each step's output into
# the next. Edit this tuple to try a different combination or order.
for preprocessing_step in (
    remove_symbols,
    remove_stop_words,
    get_stemmed_text,
    get_lemmatized_text,
):
    review = preprocessing_step(review)

# Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words encoder created with binary=True; it is kept in `cv` so that
# new text can later be encoded with the same fitted vocabulary.
cv = CountVectorizer(binary=True)
# Learn the vocabulary from the cleaned reviews and encode them in one call.
reviews = cv.fit_transform(review)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Train model - Logistic Regression

In [None]:
def fit_by_lr(X, Y, c=0.25, train_size=0.80, random_state=None):
    """Train logistic regression on a random split and print its accuracy.

    The data is shuffled and split into a training and a validation part, a
    ``LogisticRegression`` model is fitted on the training part, and the
    accuracy on the validation part is printed.

    Args:
        X: feature matrix with one row per sample.
        Y: labels aligned with the rows of ``X``.
        c: value passed to ``LogisticRegression`` as ``C``.
        train_size: share of the samples used for training; forwarded to
            ``train_test_split``. Defaults to the previously hard-coded 0.80.
        random_state: seed forwarded to ``train_test_split``. Pass an int to
            make the split, and therefore the reported accuracy,
            reproducible; the default ``None`` keeps a fresh random split on
            every call.
    """
    from sklearn.linear_model import LogisticRegression

    X_train, X_val, Y_train, Y_val = train_test_split(
        X, Y, train_size=train_size, shuffle=True, random_state=random_state
    )
    lr = LogisticRegression(C=c)
    lr.fit(X_train, Y_train)
    print("Accuracy with Logistic Regression: %s" % (accuracy_score(Y_val, lr.predict(X_val))))

# Train model - SVM

In [None]:
def fit_by_SVM(X, Y, c=1, train_size=0.80, random_state=None):
    """Train a linear SVM on a random split and print its accuracy.

    The data is shuffled and split into a training and a validation part, a
    ``LinearSVC`` model is fitted on the training part, and the accuracy on
    the validation part is printed.

    Args:
        X: feature matrix with one row per sample.
        Y: labels aligned with the rows of ``X``.
        c: value passed to ``LinearSVC`` as ``C``.
        train_size: share of the samples used for training; forwarded to
            ``train_test_split``. Defaults to the previously hard-coded 0.80.
        random_state: seed forwarded to ``train_test_split``. Pass an int to
            make the split, and therefore the reported accuracy,
            reproducible; the default ``None`` keeps a fresh random split on
            every call.
    """
    from sklearn.svm import LinearSVC

    X_train, X_val, Y_train, Y_val = train_test_split(
        X, Y, train_size=train_size, shuffle=True, random_state=random_state
    )
    svm = LinearSVC(C=c)
    svm.fit(X_train, Y_train)
    print("Accuracy with SVM: %s" % (accuracy_score(Y_val, svm.predict(X_val))))

In [None]:
# Test which model works better: train and score each classifier on the same
# vectorized reviews and labels. Each call prints its own accuracy.
for train_and_report in (fit_by_lr, fit_by_SVM):
    train_and_report(reviews, sentiments)

# Predict sentiment for some new reviews

In [None]:
# Unseen example reviews, kept in the same raw, space-tokenized form as the
# text originally read from the dataset files.
new_reviews = [
    'exciting , scary , great special effects ( unlike other summer movies the effects do not take over the movie and are only there when it is really necessary ) and good performances .',
    'from start to finish , this movie does not let go of you .',
]

In [None]:
# Encode the new reviews with the already-fitted vectorizer so they share the
# training vocabulary. The list is named `new_reviews` (plural); the previous
# reference to `new_review` raised a NameError.
new_transformed = cv.transform(new_reviews)