<a href="https://colab.research.google.com/github/weiweitoo/airline-twitter-sentiment/blob/master/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/weiweitoo/airline-twitter-sentiment/blob/master/model.ipynb)

In [0]:
# Mount Google Drive inside the Colab VM at /content/drive.
# Running this cell prompts for an OAuth authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# !pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
# !pip install fastai

In [0]:
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os
from sklearn.model_selection import train_test_split

In [0]:
# Folder on the mounted Drive that holds the dataset. The path is anchored at
# the absolute mount point passed to drive.mount('/content/drive'), so it
# resolves regardless of the kernel's current working directory.
root_path = "/content/drive/My Drive/Colab Notebooks/AirlineTweetSentiment"
# Raw tweets table; later cells read the 'text', 'airline_sentiment',
# 'airline_sentiment_confidence' and 'negativereason' columns from it.
df = pd.read_csv(os.path.join(root_path, "Tweets.csv"))

In [0]:
# Keep only the tweets whose sentiment label confidence is above 0.67.
df_high_confident = df.loc[df['airline_sentiment_confidence'].gt(0.67)]
total_data_size = len(df)
confident_data_size = len(df_high_confident)
# Row counts before and after filtering, one per line.
print(total_data_size, confident_data_size, sep='\n')
# Distinct sentiment labels present in the full dataset.
print(df['airline_sentiment'].unique())

14640
12283
['neutral' 'positive' 'negative']


# Sentiment Analysis

In [0]:
# Inputs are the tweet texts, targets are the sentiment labels
# (high-confidence rows only).
X, y = df_high_confident['text'], df_high_confident['airline_sentiment']

In [0]:
# Hold out 30% of the rows for validation; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [0]:
# Two-column frames with the label first and the text second, one per split.
col_names = ['labels', 'text']
df_trn = pd.DataFrame(dict(labels=y_train, text=X_train), columns=col_names)
df_val = pd.DataFrame(dict(labels=y_test, text=X_test), columns=col_names)

In [0]:
# Sanity check: (rows, columns) of the validation frame.
df_val.shape

(3685, 2)

In [0]:
# Language-model DataBunch built from the training / validation frames.
data_lm = TextLMDataBunch.from_df(path='./', train_df=df_trn, valid_df=df_val)

# Classifier DataBunch that reuses the language model's training vocabulary
# and serves batches of 32.
data_clas = TextClasDataBunch.from_df(
    "./",
    df_trn,
    df_val,
    vocab=data_lm.train_ds.vocab,
    bs=32,
)

# Persist both DataBunches under the names 'tmp_lm' and 'tmp_clas'.
data_lm.save('tmp_lm')
data_clas.save('tmp_clas')

In [0]:
# Display a sample batch from the language-model DataBunch as a visual check.
data_lm.show_batch()

In [0]:
# Reload the DataBunches written by data_lm.save('tmp_lm') and
# data_clas.save('tmp_clas'). `load_data` is the reader that matches
# `DataBunch.save`; the `TextLMDataBunch.load` / `TextClasDataBunch.load`
# classmethods are deprecated and only read the older folder-based cache.
# NOTE(review): assumes fastai >= 1.0.44 (pickle-based save) - confirm the
# installed version.
data_lm = load_data('./', 'tmp_lm')
# The batch size is not restored from the saved file, so pass the same bs=32
# that was used when the classifier DataBunch was created.
data_clas = load_data('./', 'tmp_clas', bs=32)

In [0]:
# Language-model learner with the AWD_LSTM architecture on data_lm;
# drop_mult=0.5 is passed through as the dropout multiplier.
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.5)

In [0]:
# Make every layer group trainable, then run one one-cycle epoch with
# learning rates sliced from 2e-3/100 up to 2e-3.
learn.unfreeze()
learn.fit_one_cycle(1, slice(2e-3/100, 2e-3))

In [0]:
# Ask the fine-tuned language model for 10 words continuing the prompt.
learn.predict("This is a review about", n_words=10)

In [0]:
# Save the language model's encoder as 'ft_enc' BEFORE `learn` is rebound:
# the next line replaces the language-model learner with a classifier learner.
learn.save_encoder('ft_enc')
learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5)
# Load the encoder saved above into the classifier.
learn.load_encoder('ft_enc')

In [0]:
# Display a sample batch from the classifier DataBunch as a visual check.
data_clas.show_batch()

In [0]:
# Train the classifier (at this point `learn` is the text classifier learner):
# unfreeze everything and run one one-cycle epoch with learning rates sliced
# from 2e-3/100 up to 2e-3.
learn.unfreeze()
learn.fit_one_cycle(1, slice(2e-3/100, 2e-3))

In [0]:
# Spot-check the classifier on a single sentence.
# NOTE(review): the prompt is a movie-review sentence although the model is
# trained on airline tweets (Tweets.csv) - consider a tweet-like example.
learn.predict("This was a great movie!")

## Sentiment Analysis Training

In [0]:
def construct_input(X, y, test_size, random_state=42, bs=32, path='./'):
  """Split (X, y) and build the language-model and classifier DataBunches.

  Args:
    X: texts (a pandas Series in this notebook).
    y: labels aligned with X. Entries whose label is missing (NaN/None) are
      dropped together with their text before splitting, because a row
      without a label cannot be used to train or validate the classifier.
    test_size: forwarded to train_test_split as the validation share.
    random_state: seed forwarded to train_test_split (default 42).
    bs: batch size of the classifier DataBunch (default 32).
    path: working directory handed to both DataBunches (default './').

  Returns:
    Tuple (data_lm, data_clas). As a side effect both DataBunches are saved
    under the names 'tmp_lm' and 'tmp_clas'.
  """
  # Filter only when something is actually missing so fully-labelled inputs
  # go through exactly as before.
  has_label = np.asarray(pd.notna(y))
  if not has_label.all():
    X, y = X[has_label], y[has_label]

  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=test_size, random_state=random_state)

  # Label column first, text column second.
  col_names = ['labels', 'text']
  df_trn = pd.DataFrame({'text': X_train, 'labels': y_train}, columns=col_names)
  df_val = pd.DataFrame({'text': X_test, 'labels': y_test}, columns=col_names)

  data_lm = TextLMDataBunch.from_df(path, train_df=df_trn, valid_df=df_val)
  # The classifier reuses the language model's training vocabulary.
  data_clas = TextClasDataBunch.from_df(path=path, train_df=df_trn, valid_df=df_val,
                                        vocab=data_lm.train_ds.vocab, bs=bs)
  data_lm.save('tmp_lm')
  data_clas.save('tmp_clas')
  return data_lm, data_clas

def learn_language_model(data_lm, epochs=1, max_lr=2e-3, drop_mult=0.5, encoder_name="ft_enc"):
  """Fine-tune an AWD_LSTM language model on data_lm and save its encoder.

  Args:
    data_lm: language-model DataBunch (as returned by construct_input).
    epochs: number of one-cycle epochs to train (default 1).
    max_lr: upper learning rate; the slice passed to fit_one_cycle runs from
      max_lr/100 to max_lr (default 2e-3).
    drop_mult: dropout multiplier passed to language_model_learner (default 0.5).
    encoder_name: name under which the encoder is saved (default "ft_enc").

  Returns:
    The trained language-model learner.
  """
  learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=drop_mult)
  # Freeze every layer group except the last two before training.
  learn.freeze_to(-2)
  learn.fit_one_cycle(epochs, slice(max_lr/100, max_lr))
  # The saved encoder is what the classifier loads afterwards.
  learn.save_encoder(encoder_name)
  return learn

def learn_classification_model(data_clas, epochs=1, max_lr=2e-3, drop_mult=0.5, encoder_name='ft_enc'):
  """Train an AWD_LSTM text classifier on data_clas from a saved encoder.

  An encoder saved under `encoder_name` must already exist; in this notebook
  it is written by learn_language_model, so that function has to run first.

  Args:
    data_clas: classifier DataBunch (as returned by construct_input).
    epochs: number of one-cycle epochs to train (default 1).
    max_lr: upper learning rate; the slice passed to fit_one_cycle runs from
      max_lr/100 to max_lr (default 2e-3).
    drop_mult: dropout multiplier passed to text_classifier_learner (default 0.5).
    encoder_name: name of the saved encoder to load (default 'ft_enc').

  Returns:
    The trained classifier learner.
  """
  learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=drop_mult)
  learn.load_encoder(encoder_name)
  # Freeze every layer group except the last two before training.
  learn.freeze_to(-2)
  learn.fit_one_cycle(epochs, slice(max_lr/100, max_lr))
  return learn

In [0]:
# Sentiment task: tweet text as input, airline sentiment as the label.
X, y = df_high_confident['text'], df_high_confident['airline_sentiment']
# 30% of the rows go to the validation split.
data_lm, data_clas = construct_input(X, y, test_size=0.3)

In [0]:
# Order matters: the language model saves the 'ft_enc' encoder that the
# classifier loads, so it must be trained first.
learner_lm = learn_language_model(data_lm)
learner_clas = learn_classification_model(data_clas)

epoch,train_loss,valid_loss,accuracy,time
0,5.359085,4.748312,0.202126,08:46


epoch,train_loss,valid_loss,accuracy,time
0,0.724342,0.560413,0.797015,07:03


## Negative Reason Prediction

In [0]:
# Negative-reason task: only rows that actually carry a reason can serve as
# labelled examples. Rows with a missing 'negativereason' are excluded so the
# classifier is not fed NaN labels.
# NOTE(review): presumably the reason is only filled in for negative tweets -
# verify against Tweets.csv.
has_reason = df_high_confident['negativereason'].notna()
X = df_high_confident.loc[has_reason, 'text']
y = df_high_confident.loc[has_reason, 'negativereason']

In [0]:
# 70/30 split of the negative-reason data with a fixed seed.
# NOTE(review): the X_train/X_test/y_train/y_test produced here are not read by
# the remaining cells shown in this notebook; construct_input performs its own
# split on X and y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [0]:
# Rebuild both DataBunches for the negative-reason task (30% validation).
# This rebinds data_lm / data_clas and overwrites the saved 'tmp_lm' / 'tmp_clas'.
data_lm, data_clas = construct_input(X, y, 0.3)

In [0]:
# Same two-step training as for sentiment: language model first (it saves the
# 'ft_enc' encoder), then the classifier that loads it. This rebinds
# learner_lm / learner_clas, replacing the sentiment learners.
learner_lm = learn_language_model(data_lm)
learner_clas = learn_classification_model(data_clas)