## Text Classification

### 1. Obtain Data

In [None]:
import pandas as pd
# Load the dataset from CSV; the path is relative to the working directory.
dataset_path = "data/TheSocialDilemma.csv"
df = pd.read_csv(dataset_path)

# Preview the first rows as a quick sanity check of the columns.
df.head()

### 2. Exploratory Data Analysis (EDA)

In [None]:
# Print the column dtypes and non-null counts.
df.info()

# Tally how many rows fall into each sentiment class; leaving the result as
# the last expression displays it as the cell output.
sentiment_counts = df["Sentiment"].value_counts()
sentiment_counts

### 3. Data Preparation

In [None]:
# Collapse the sentiment column into a binary target: rows whose sentiment is
# exactly "Positive" become "positive"; every other value becomes
# "notpositive".
# The lambda parameter is deliberately not called `input`, which would shadow
# the builtin of the same name.
df["label"] = df["Sentiment"].apply(
    lambda sentiment: "positive" if sentiment == "Positive" else "notpositive"
)

# Keep only the model input ("text") and the target ("label") columns.
df = df[["text", "label"]]
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw text column; targets are the binary labels.
X, y = df["text"], df["label"]

# Hold out 10% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
split = train_test_split(X, y, test_size=0.1, random_state=17)
X_train, X_test, y_train, y_test = split

### 4. Model Fitting

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Chain TF-IDF vectorisation and a multinomial Naive Bayes classifier so that
# raw text can be passed straight to fit() and predict().
pipeline = Pipeline(
    steps=[
        ("vectoriser", TfidfVectorizer()),
        ("model", MultinomialNB()),
    ]
)

# Fit both steps on the training split only.
pipeline.fit(X_train, y_train)

### 5. Model Evaluation

In [None]:
from sklearn import metrics

# Predict labels for the held-out test texts.
y_prediction = pipeline.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
accuracy = metrics.accuracy_score(y_test, y_prediction)
print(accuracy)

# Counts of true labels (rows) against predicted labels (columns).
confusion = metrics.confusion_matrix(y_test, y_prediction)
print(confusion)

### 6. Model Application

In [None]:
# Classify a single unseen sentence. It is wrapped in a list so that
# predict() receives a collection of documents rather than a bare string.
sample_texts = ["This movie totally sucks"]
prediction = pipeline.predict(sample_texts)
print(prediction)