# File 10: Classification Model to Improve Accuracy

### Input Files:
- 04-train-data.csv

### Output Files:
 - 10-positive-word-score.csv
 - 10-negative-word-score.csv

### Steps:
1. loading required python libraries
1. loading training dataset
1. separating positive and negative labelled tweets
1. creating counters
1. counting the number of positive words
1. counting the number of negative words
1. calculating positive/negative ratio
1. separating positive and negative words
1. normalizing score
1. saving datasets

In [None]:
# loading required python libraries
from collections import Counter

import nltk
import pandas as pd

In [None]:
# Load the training dataset, keeping only the label and tweet-text columns.
df = pd.read_csv("../db/04-train-data.csv")[['SENTIMENT', 'TEXT']]
# Cast TEXT to str so string methods can be applied to every row.
df['TEXT'] = df['TEXT'].astype('str')

In [None]:
# Separate the tweet texts by label: rows with SENTIMENT == 1 go to `pos`,
# rows with SENTIMENT == 0 go to `neg`. Both end up as plain Python lists.
is_positive = df['SENTIMENT'] == 1
is_negative = df['SENTIMENT'] == 0
pos = df.loc[is_positive, 'TEXT'].values.tolist()
neg = df.loc[is_negative, 'TEXT'].values.tolist()

In [None]:
# Three independent word-frequency tallies: one per class, plus one that
# accumulates counts across both classes.
pos_counts, neg_counts, total_counts = Counter(), Counter(), Counter()

In [None]:
# Count every token of the positive tweets, both in the positive tally and
# in the overall tally. Tokens come from lower-casing the tweet and splitting
# on a single space, so consecutive spaces yield empty-string tokens.
for tweet in pos:
    tokens = tweet.lower().split(" ")
    pos_counts.update(tokens)
    total_counts.update(tokens)

In [None]:
# Count every token of the negative tweets, both in the negative tally and
# in the overall tally. Tokens come from lower-casing the tweet and splitting
# on a single space, so consecutive spaces yield empty-string tokens.
for tweet in neg:
    tokens = tweet.lower().split(" ")
    neg_counts.update(tokens)
    total_counts.update(tokens)

In [None]:
# Positive/negative ratio for every word seen more than 100 times overall.
# The denominator is the negative count plus one, which avoids dividing by
# zero for words that never occur in a negative tweet.
pos_neg_score = Counter({
    term: pos_counts[term] / float(neg_counts[term] + 1)
    for term, cnt in total_counts.most_common()
    if cnt > 100
})

In [None]:
# Rank words by ratio (highest first) and drop any word whose ratio is 30
# or more.
ranked = pd.DataFrame(pos_neg_score.most_common(), columns=['WORD', 'SCORE'])
pnscore = ranked.loc[ranked['SCORE'] < 30]
# The first 4000 ranked rows form the positive word list; all remaining rows
# form the negative word list. Both get a fresh 0-based index.
pscore = pnscore.iloc[:4000].reset_index(drop=True)
nscore = pnscore.iloc[4000:].reset_index(drop=True)

In [None]:
# Normalize the scores of each word list with min-max scaling.
# NOTE: min()/max() raise ValueError on an empty list, and the division
# raises ZeroDivisionError when every score in a list is identical.

# Positive words: scale linearly so the lowest score maps to 0 and the
# highest to 1.
pos_values = pscore['SCORE'].values.tolist()
pos_low, pos_high = min(pos_values), max(pos_values)
pos_scaled = [(v - pos_low) / (pos_high - pos_low) for v in pos_values]
pscore = pd.DataFrame(
    list(zip(pscore['WORD'].values.tolist(), pos_scaled)),
    columns=['WORDS', 'SCORE'],
)

# Negative words: apply the same scaling, then subtract from 1 and negate,
# so the lowest ratio maps to -1 and the highest to 0.
neg_values = nscore['SCORE'].values.tolist()
neg_low, neg_high = min(neg_values), max(neg_values)
neg_scaled = [
    float(-(1 - ((v - neg_low) / (neg_high - neg_low))))
    for v in neg_values
]
nscore = pd.DataFrame(
    list(zip(nscore['WORD'].values.tolist(), neg_scaled)),
    columns=['WORDS', 'SCORE'],
)

In [None]:
# Write both word-score tables to CSV, leaving out the DataFrame index.
for frame, path in (
    (pscore, '../db/10-positive-word-score.csv'),
    (nscore, '../db/10-negative-word-score.csv'),
):
    frame.to_csv(path, index=False)