# Assignment #1
## Topic: Political Reviews - Generate Lexicon Sentiment

Name: **Hoo Yee Torng** </br>
Matrix: **P101447**

## 1.0 IMPORT LIBRARY
Import all the library needed and download the database for Spacy, NLTK Lemmatizer, Stopwords etc

In [1]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.corpus import stopwords
import re
import spacy

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nlp = spacy.load('en_core_web_sm')
nltk.download('averaged_perceptron_tagger')
stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## 2.0 TEXT PROCESSING FUNCTION
Define text processig function 
* extract_words - Only return alphebert words in lower case
* get_adjective - Use both Spacy and NTLK to return adjective words
* lemmatized_remove_stopword - Lemmatized the words and remove stop words
* is_word_exist - To check if the word exist in the word list


In [0]:
# RETURN WORD ONLY
def extract_words(setence):
  letters = re.sub("[^a-zA-Z]", " ", setence)
  return(letters.lower())


# GET ADJECTIVE ONLY
# USE BOTH SPACY AND NTLK
def get_adjective_word(setence):
  words = []

  ## CHECK BY SPACY FIRST
  spacy_words = nlp(u''+setence+'')
  for token in spacy_words:
    if token.pos_ == 'ADV':
      words.append(token.text)

  ## CHECK BY NLTK
  nltk_words = nltk.word_tokenize(setence)
  for stc in nltk_words:
    nltk_token = nltk.pos_tag(nltk.word_tokenize(stc))
    if nltk_token[0][1] == 'JJ' and nltk_token[0][0] not in words:
      words.append(nltk_token[0][0])
    
  return words


# LEMMATIZED AND REMOVE STOP WORD
def lemmatized_remove_stopword(sentence):
    new_line = []
    word_tokens = nltk.word_tokenize(sentence)
    for w in word_tokens:
      output_word = lemmatizer.lemmatize(w)
      if output_word not in stop_words:
        new_line.append(output_word)
    
    return (" ".join(new_line) + " ").strip()

def is_word_exist(word, words):
  for i in range(len(words)):
    savedword = words[i][0].strip()
    if savedword == word.strip():
      return True
  
  return False

### 3.0 GENERATE LEXICON SENTIMENT FILE
This is the function to generate the lexicon sentiment csv file

In [0]:
def create_lexicon_sentiment(source_df, score, export_file_name):

  words = []

  ttl_adj_words = 0
  ttl_words = 0

  for i in range(len(source_df)):
    input_review = source_df.loc[i, "original"]
    input_review = extract_words(input_review)
    input_review = lemmatized_remove_stopword(input_review)
    input_words = get_adjective_word(input_review)

    for word in input_words:
      ttl_adj_words += 1

      if not is_word_exist(word, words):
        ttl_words += 1
        output_word = [word, score]
        words.append(output_word)

  print("Total Adjective Words: {}".format(ttl_adj_words))
  print("Total Final Words: {}".format(ttl_words))

  outdf = pd.DataFrame(words)
  outdf.rename(columns={0: "word", 1:"Score"}, errors="raise", inplace=True)
  outdf.to_csv(export_file_name, index=False)

  print(export_file_name + " is created!")

  return outdf


### 4.0 GENERATE POSITIVE LEXICON SENTIMENT

In [4]:
positive_df = pd.read_csv('politic_issues_positive_reviews.csv', index_col=None)  
positive_df.rename(columns={"Google Translate": "original"}, errors="raise", inplace=True)
out_pos_df = create_lexicon_sentiment(positive_df, 1, "lexicon_political_positive.csv")

Total Adjective Words: 158
Total Final Words: 104
lexicon_political_positive.csv is created!


### 5.0 GENERATE NEGATIVE LEXICON SENTIMENT



In [5]:
negative_df = pd.read_csv('politic_issues_negative_reviews.csv', index_col=None)  
negative_df.rename(columns={"Google Translate": "original"}, errors="raise", inplace=True)
out_neg_df = create_lexicon_sentiment(negative_df, -1, "lexicon_political_negative.csv")

Total Adjective Words: 152
Total Final Words: 100
lexicon_political_negative.csv is created!


### 6.0 GENERATE MASTER LEXICON SENTIMENT
The code will combine both Positive and Negative into one master files. <b>due to some word are exist in both positive and negative, hence the code below will flag it as 0. User need to manually update the 0 to either 1 or -1 if needed</b>

In [6]:
pos_list = out_pos_df.values.tolist()
neg_list = out_neg_df.values.tolist()
all_list = []

print("pos_list: " + str(len(pos_list)))
print("neg_list: " + str(len(neg_list)))
print("all_list: " + str(len(all_list)))

print("- COMPARE POSITIVE IN NEGATIVE")
for first_item in pos_list:
  selected_first_item = first_item[0].strip()
  for second_item in neg_list:
    selected_second_item = second_item[0].strip()
    if selected_first_item == selected_second_item:
      first_item = [selected_first_item, 0]
      break
  all_list.append(first_item)

print("pos_list: " + str(len(pos_list)))
print("neg_list: " + str(len(neg_list)))
print("all_list: " + str(len(all_list)))


print("- COMPARE NEGATIVE IN MASTER")
for first_item in neg_list:
  selected_first_item = first_item[0].strip()
  
  # IF THE WORD ALREADY ADDED
  ifexit = False
  for second_item in all_list:
    selected_second_item = second_item[0].strip()
    if selected_first_item == selected_second_item:
      # BREAK THE LOOP IF ALREADY EXIST
      ifexit = True
      break
  
  if not ifexit:
    all_list.append(first_item)

print("pos_list: " + str(len(pos_list)))
print("neg_list: " + str(len(neg_list)))
print("all_list: " + str(len(all_list)))

alldf = pd.DataFrame(all_list) 
alldf.rename(columns={0: "word", 1:"Score"}, errors="raise", inplace=True)
print(alldf)
alldf.to_csv("lexicon_political_master.csv", index=False)

pos_list: 104
neg_list: 100
all_list: 0
- COMPARE POSITIVE IN NEGATIVE
pos_list: 104
neg_list: 100
all_list: 104
- COMPARE NEGATIVE IN MASTER
pos_list: 104
neg_list: 100
all_list: 174
           word  Score
0          also      1
1       cynical      1
2    indigenous      0
3         aware      1
4    successful      0
..          ...    ...
169     umpteen     -1
170    actually     -1
171     exactly     -1
172       moral     -1
173   regularly     -1

[174 rows x 2 columns]
