<!-- Tala Vahedi
October 30, 2021
Week Ten Assignment

Script Purpose: Perform the Scikit-Learn Tutorial
Script Version: 1.0 
Script Author:  Tala Vahedi, University of Arizona

Script Revision History:
Version 1.0 Oct 30, 2021, Python 3.x

Using the Scikit-Learn Tutorial found here:  https://www.dataquest.io/blog/sci-kit-learn-tutorial/ 
Perform the operations from the beginning of the Tutorial stopping when you get to the Building the 
Model Section.

Create a Report that captures each major steps of the process and provide a short write up of what 
you learned during that step and what areas are you still confused about.
 -->

In [179]:
# Psuedo Constants
SCRIPT_NAME    = "Script: Perform Sentiment Analysis using K-Nearest Neighbor Classification model "
SCRIPT_VERSION = "Version 1.0"
SCRIPT_AUTHOR  = "Author: Tala Vahedi"

# Python Standard Library
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [180]:
# Print Basic Script Information
print()
print(SCRIPT_NAME)
print(SCRIPT_VERSION)
print(SCRIPT_AUTHOR)
print()  


Script: Perform Sentiment Analysis using K-Nearest Neighbor Classification model 
Version 1.0
Author: Tala Vahedi



In [181]:
# Read in the data with `read_csv()`
data = pd.read_csv("nuclearPower.csv", dtype=str)

# Using .head() method to view the first few records of the data set
data.head()

Unnamed: 0,tweet_text,sentiment,sentiment_confidence_summary
0,:Hello Japan is a nuclear power plant crisis. ...,Negative,"""Neutral / author is just sharing information""..."
1,. Renewable Energy Consumption Tops Nuclear fo...,Neutral / author is just sharing information,"""Neutral / author is just sharing information""..."
2,. Will liberals now seek to eliminate dangerou...,Neutral / author is just sharing information,"""Neutral / author is just sharing information""..."
3,"(Aug 22,2011)Plant Status of Fukushima Daiichi...",Neutral / author is just sharing information,"""Neutral / author is just sharing information""..."
4,[ebook] Nuclear Energy and the Environment: Ox...,Neutral / author is just sharing information,"""Neutral / author is just sharing information""..."


In [182]:
# using the dtypes() method to display the different datatypes available
data.dtypes

tweet_text                      object
sentiment                       object
sentiment_confidence_summary    object
dtype: object

In [183]:
# preprocess data: make lower case
data = data.apply(lambda x: x.astype(str).str.lower())

data.head()

Unnamed: 0,tweet_text,sentiment,sentiment_confidence_summary
0,:hello japan is a nuclear power plant crisis. ...,negative,"""neutral / author is just sharing information""..."
1,. renewable energy consumption tops nuclear fo...,neutral / author is just sharing information,"""neutral / author is just sharing information""..."
2,. will liberals now seek to eliminate dangerou...,neutral / author is just sharing information,"""neutral / author is just sharing information""..."
3,"(aug 22,2011)plant status of fukushima daiichi...",neutral / author is just sharing information,"""neutral / author is just sharing information""..."
4,[ebook] nuclear energy and the environment: ox...,neutral / author is just sharing information,"""neutral / author is just sharing information""..."


In [184]:
# preprocess data: remove non ascii characters
data.tweet_text.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
data.sentiment.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
data.sentiment_confidence_summary.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)

data.head()

Unnamed: 0,tweet_text,sentiment,sentiment_confidence_summary
0,:hello japan is a nuclear power plant crisis. ...,negative,"""neutral / author is just sharing information""..."
1,. renewable energy consumption tops nuclear fo...,neutral / author is just sharing information,"""neutral / author is just sharing information""..."
2,. will liberals now seek to eliminate dangerou...,neutral / author is just sharing information,"""neutral / author is just sharing information""..."
3,"(aug 22,2011)plant status of fukushima daiichi...",neutral / author is just sharing information,"""neutral / author is just sharing information""..."
4,[ebook] nuclear energy and the environment: ox...,neutral / author is just sharing information,"""neutral / author is just sharing information""..."


In [186]:
# converting dtype to string 
data["tweet_text"]= data["tweet_text"].astype(str) 
data["sentiment"]= data["sentiment"].astype(str) 
data["sentiment_confidence_summary"]= data["sentiment_confidence_summary"].astype(str) 

# removing special characters in all columns 
data["tweet_text"]= data['tweet_text'].map(lambda x: re.sub(r'\W+', ' ', x))
data["sentiment"]= data['sentiment'].map(lambda x: re.sub(r'\W+', ' ', x))
data["sentiment_confidence_summary"]= data['sentiment_confidence_summary'].map(lambda x: re.sub(r'\W+', ' ', x))

data.head()

Unnamed: 0,tweet_text,sentiment,sentiment_confidence_summary
0,hello japan is a nuclear power plant crisis l...,negative,neutral author is just sharing information 0 ...
1,renewable energy consumption tops nuclear for...,neutral author is just sharing information,neutral author is just sharing information 1 0
2,will liberals now seek to eliminate dangerous...,neutral author is just sharing information,neutral author is just sharing information 0 ...
3,aug 22 2011 plant status of fukushima daiichi...,neutral author is just sharing information,neutral author is just sharing information 1 0
4,ebook nuclear energy and the environment oxf ...,neutral author is just sharing information,neutral author is just sharing information 1 0


In [187]:
# preprocess data: remove stop words
with open("STOP_WORDS.txt", 'r') as stopWordList:
    stopWords = stopWordList.read()
STOP_WORDS = stopWords.split()

data['tweet_text'] = data['tweet_text'].apply(lambda x: ' '.join([item for item in x.split() if item not in STOP_WORDS]))
data['sentiment'] = data['sentiment'].apply(lambda x: ' '.join([item for item in x.split() if item not in STOP_WORDS]))
data['sentiment_confidence_summary'] = data['sentiment_confidence_summary'].apply(lambda x: ' '.join([item for item in x.split() if item not in STOP_WORDS]))

data.head()

Unnamed: 0,tweet_text,sentiment,sentiment_confidence_summary
0,hello japan nuclear power plant crisis link,negative,neutral author sharing 0 2 negative 0 8
1,renewable energy consumption tops nuclear firs...,neutral author sharing,neutral author sharing 1 0
2,liberals now seek eliminate dangerous nuclear ...,neutral author sharing,neutral author sharing 0 667 negative 0 333
3,aug 22 2011 plant status fukushima daiichi nuc...,neutral author sharing,neutral author sharing 1 0
4,ebook nuclear energy environment oxf ord un iv...,neutral author sharing,neutral author sharing 1 0


In [203]:
# splitting the sentiment target variable into only negative, neutral and positive tags
data['sentiment'] = data['sentiment'].str.split(' ').str[0]
# removing the weak confidence scores for the sentiments
data['sentiment_confidence_summary'] = data['sentiment_confidence_summary'].str.split(' ').str[0]

# filtering out rows that dont have negative confidence into a new dataframe
dataPreprocessed = data[data['sentiment_confidence_summary'] != "negative"]
# dropping the sentiment confidence column since it is no longer needed
dataPreprocessed = dataPreprocessed.drop(columns=['sentiment_confidence_summary'])
dataPreprocessed.head()

Unnamed: 0,tweet_text,sentiment
0,hello japan nuclear power plant crisis link,negative
1,renewable energy consumption tops nuclear firs...,neutral
2,liberals now seek eliminate dangerous nuclear ...,neutral
3,aug 22 2011 plant status fukushima daiichi nuc...,neutral
4,ebook nuclear energy environment oxf ord un iv...,neutral


In [205]:
# convert the sales data features that have string values into numeric labels
print("tweet : ", dataPreprocessed['tweet_text'].unique())
print("sentiment : ", dataPreprocessed['sentiment'].unique())

tweet :  ['hello japan nuclear power plant crisis link'
 'renewable energy consumption tops nuclear firsttime link nuclear energy renewable'
 'liberals now seek eliminate dangerous nuclear power plants link green energy news'
 'aug 22 2011 plant status fukushima daiichi nuclear power station 3 00 pm aug 22 link tepco japan'
 'ebook nuclear energy environment oxf ord un iversity pr ess 2011 isbn 0841225850 432 p link'
 'gjobs nuclear power safety campaign organizer temporary union concerned scientists cambridge ma link'
 'news nuclear power plant radioactive tritium contamination vermont link nwo'
 'science nuclear energy environment oxf ord un iversity pr ess 2011 isbn 0841225850 432 link'
 'link p windmills produce power one average nuclear power station windmills'
 'link quot thing quot m nuclear energy correctly plant built 1970 s standard'
 'link 6 min video masao yoshida fukushima diichi nuclear station all aid reports current progress'
 'link 74 percent favor gradual reduction nu

In [206]:
#import the necessary module
from sklearn import preprocessing
# create the Labelencoder object
le = preprocessing.LabelEncoder()
#convert the categorical columns into numeric
dataPreprocessed['tweet_text'] = le.fit_transform(dataPreprocessed['tweet_text'])
dataPreprocessed['sentiment'] = le.fit_transform(dataPreprocessed['sentiment'])
#display the initial records
dataPreprocessed.head()

Unnamed: 0,tweet_text,sentiment
0,3,0
1,69,1
2,4,1
3,0,1
4,1,1


In [208]:
#assigning the target variable
target = dataPreprocessed['sentiment']
dataPreprocessed.head(10)

Unnamed: 0,tweet_text,sentiment
0,3,0
1,69,1
2,4,1
3,0,1
4,1,1
5,2,1
6,68,1
7,186,1
8,14,1
9,16,2


In [209]:
#import the necessary module
from sklearn.model_selection import train_test_split
#split data set into train and test sets
data_train, data_test, target_train, target_test = train_test_split(dataPreprocessed,target, test_size = 0.30, random_state = 10)

In [210]:
#import necessary modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
#create object of the lassifier
neigh = KNeighborsClassifier(n_neighbors=3)
#Train the algorithm
neigh.fit(data_train, target_train)
# predict the response
pred = neigh.predict(data_test)
# evaluate accuracy
print ("KNeighbors accuracy score : ",accuracy_score(target_test, pred))

KNeighbors accuracy score :  0.8596491228070176
