### Importing dataset

In [1]:
import numpy as np
import pandas as pd
train_df=pd.read_csv('train.tsv',delimiter='\t')

In [11]:
train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [2]:
train_df.describe()

Unnamed: 0,PhraseId,SentenceId,Sentiment
count,156060.0,156060.0,156060.0
mean,78030.5,4079.732744,2.063578
std,45050.785842,2502.764394,0.893832
min,1.0,1.0,0.0
25%,39015.75,1861.75,2.0
50%,78030.5,4017.0,2.0
75%,117045.25,6244.0,3.0
max,156060.0,8544.0,4.0


In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
PhraseId      156060 non-null int64
SentenceId    156060 non-null int64
Phrase        156060 non-null object
Sentiment     156060 non-null int64
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


### Splitting Data into train and validation set

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(train_df["Phrase"], train_df["Sentiment"], test_size=0.2)

### Vectorizing using Scikit-Learn

Computers don't understand Texts, so we need to convert texts to numbers before we could do any math on it and see if we can build a system to classify a review as Positive or Negative.
Ways to vectorize data:

- Bag of Words
- TF-IDF
- Word Embeddings (Word2Vec)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, binary=True, stop_words="english")
cv.fit(X_train)
X_train_cv = cv.transform(X_train)
X_valid_cv = cv.transform(X_valid)

### Building RandomForestClassifier Model

In [6]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(min_samples_leaf=3, n_estimators=25, n_jobs=-1)
classifier.fit(X_train_cv, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### Loading Test Data

In [7]:
test_df = pd.read_csv("test.tsv",delimiter='\t')
test_df.head()


Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


### Vectorizing the Feature:Phrase Text

In [8]:
X_test = test_df.Phrase
X_test_vect = cv.transform(X_test)

### Predicting the Sentiment for test data

In [9]:
y_test_pred = classifier.predict(X_test_vect)

### Creating Dataframe and csv file

In [10]:
df = pd.DataFrame({"PhraseId": test_df.PhraseId,"Sentiment": y_test_pred})
#df.to_csv("output.csv", index=False)
df.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,3
1,156062,3
2,156063,2
3,156064,3
4,156065,3
