In [4]:
from google.colab import drive
drive.mount("/content/grive")

Mounted at /content/grive


In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Data Preprocessing and Feature Engineering
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

#Model Selection and Validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
data = pd.read_csv("/content/grive/MyDrive/labeled_data.csv",index_col=0)

  mask |= (ar1 == a)


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1226258 entries, 0 to 1226257
Data columns (total 5 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   publish_date   1226258 non-null  int64  
 1   headline_text  1226258 non-null  object 
 2   Polarity       1226258 non-null  float64
 3   Subjectivity   1226258 non-null  float64
 4   Analysis       1226258 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 56.1+ MB


In [8]:
data['Analysis'].unique()

array(['Neutral', 'Positive', 'Negative'], dtype=object)

In [9]:
print(data[data['Analysis']=="Positive"]['Analysis'].count())
print(data[data['Analysis']=="Negative"]['Analysis'].count())
print(data[data['Analysis']=="Neutral"]['Analysis'].count())

209676
156163
860419


In [10]:
data['labels'] = data['Analysis'].map({'Neutral':0,'Positive':1,'Negative':-1})

In [11]:
data.head()

Unnamed: 0,publish_date,headline_text,Polarity,Subjectivity,Analysis,labels
0,20030219,aba decides against community broadcasting lic...,0.0,0.0,Neutral,0
1,20030219,act fire witnesses must be aware of defamation,0.25,0.25,Positive,1
2,20030219,a g calls for infrastructure protection summit,0.0,0.0,Neutral,0
3,20030219,air nz staff in aust strike for pay rise,0.0,0.0,Neutral,0
4,20030219,air nz strike to affect australian travellers,0.0,0.0,Neutral,0


In [None]:
punct =string.punctuation
stop_words = stopwords.words('english')
clean_data = []
def data_preprocessing(news):
  words =[]

  for word in nltk.word_tokenize(news):
    if word not in punct:
        if word not in stop_words:
          words.append(word)
  clean_data.append(" ".join(words))      
  

data['headline_text'].apply(data_preprocessing)

In [13]:
normalize = []
def normalization(clean_data):
  lem = WordNetLemmatizer()
    
  for headline in clean_data:
    normalized_tweet = []
    for word in nltk.word_tokenize(headline):
        normalized_text = lem.lemmatize(word,'v')
        normalized_tweet.append(normalized_text)
    normalize.append(" ".join(normalized_tweet))
    
normalization(clean_data)

In [14]:
pipeline = Pipeline([
    ('bow',CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [15]:
msg_train, msg_test, label_train, label_test = train_test_split(data['headline_text'], data['labels'], test_size=0.2)
pipeline.fit(msg_train,label_train)
predictions = pipeline.predict(msg_test)


In [16]:
print(classification_report(predictions,label_test))
print(confusion_matrix(predictions,label_test))
print(accuracy_score(predictions,label_test))

              precision    recall  f1-score   support

          -1       0.64      0.95      0.76     21161
           0       1.00      0.91      0.95    189098
           1       0.78      0.93      0.85     34993

    accuracy                           0.91    245252
   macro avg       0.80      0.93      0.85    245252
weighted avg       0.94      0.91      0.92    245252

[[ 20069    142    950]
 [  9250 171554   8294]
 [  2152    332  32509]]
0.9138844943160505


In [17]:
import pickle
filename = 'finalized_model.sav'
pickle.dump(pipeline, open(filename, 'wb'))