## Sentiment Analysis - Amazon baby data

In [None]:
import pandas as pd
# Path of the Amazon baby-products review CSV. Edit this to match the local machine
# (an alternative copy used previously: D:/SYED/data/amazon_baby.csv).
AMAZON_BABY_CSV = "D:/Docs/amazon_baby.csv"
products = pd.read_csv(AMAZON_BABY_CSV)
# Print column names, dtypes and non-null counts of the loaded frame.
products.info()

## Replace NaN values in the review column with empty strings

In [None]:
# Replace missing values in the review column with the empty string.
products = products.fillna(value={'review': ''})
# Displays whether any NaN is still present in the review column.
products['review'].isna().to_numpy().any()

## Remove punctuation marks

In [None]:
import string
# Translation table that deletes every character listed in string.punctuation.
# It is built once here rather than inside the per-row function.
punctuation_table = str.maketrans('', '', string.punctuation)
products['review_clean'] = products['review'].apply(
    lambda text: text.translate(punctuation_table)
)
products.head(3)

### Remove reviews with a rating of 3

In [None]:
# Keep only the reviews whose rating differs from 3. The explicit copy makes
# `products` an independent frame, so adding columns to it afterwards does not
# trigger pandas' SettingWithCopyWarning.
products = products[products['rating'] != 3].copy()
products.head(3)

### Sentiment label: a rating above 3 is +1, otherwise -1

In [None]:
# Label every review: +1 when its rating is above 3, -1 otherwise.
# The comparison is vectorised, and `assign` returns a new frame instead of
# writing a column into a filtered slice.
is_positive = products['rating'] > 3
products = products.assign(sentiment=is_positive.map({True: 1, False: -1}))
# Display the review text stored under index label 27401.
products.loc[27401, 'review']

## Split the data into training and test sets
* `train-idx.json` for Training data
* `test-idx.json` for Test data

In [None]:
# Folder containing the JSON files that list the row indices of each split.
SPLIT_INDEX_DIR = "D:/repos/CourseraPlus/1_ML_Combined_Courses/Course/Week-2"

train_data_indices = pd.read_json(SPLIT_INDEX_DIR + "/train-idx.json")
test_data_indices = pd.read_json(SPLIT_INDEX_DIR + "/test-idx.json")

# Column 0 of each index table is used as the row index of the new frame.
# NOTE(review): passing `index=` selects rows of `products` by label, so labels
# that are no longer in `products` presumably come back as all-NaN rows.
# Confirm that label-based (rather than positional) selection is intended.
train_data = pd.DataFrame(products, index=train_data_indices[0])
test_data = pd.DataFrame(products, index=test_data_indices[0])

print(f"Train set: {train_data.shape}")
print(f"Test set: {test_data.shape}")

### Drop rows with a missing `name` before training

In [None]:
# Drop the rows whose `name` is missing. Rebinding (instead of inplace=True)
# keeps the cell free of in-place mutation.
train_data = train_data.dropna(subset=["name"])
test_data = test_data.dropna(subset=["name"])

# Report how many NaN cells are left in each split after the drop.
print("Train NaN cells remaining: " + str(train_data.isnull().values.sum()))
print("Test NaN cells remaining: " + str(test_data.isnull().values.sum()))
print("Train set: " + str(train_data.shape))
print("Test set: " + str(test_data.shape))

## Dictionary of word counts

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectoriser. The token pattern matches any run of one or more
# word characters delimited by word boundaries.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')

# The vocabulary is learned from the training reviews only ...
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# ... and then applied unchanged to the test reviews.
test_matrix = vectorizer.transform(test_data['review_clean'])

print(f"Train matrix: {train_matrix.shape}")
print(f"Test matrix: {test_matrix.shape}")

## Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the word counts of the training reviews.
sentiment_model = LogisticRegression(random_state=0)
sentiment_model.fit(train_matrix, train_data['sentiment'])

# Despite its name, zero_elem is the number of strictly positive coefficients;
# less_elem is the number of coefficients that are zero or negative.
zero_elem = (sentiment_model.coef_ > 0).sum()
less_elem = (sentiment_model.coef_ <= 0).sum()
total_val = zero_elem + less_elem
print(f"Co-efficients with value more than 0: {zero_elem} out of {total_val}")

In [None]:
# Take the five test rows at positions 45 through 49 as a small sample.
sample_test_data = test_data.iloc[45:50]
sample_test_data

In [None]:
# Vectorise the sample reviews with the already-fitted vocabulary, then ask the
# model for their raw decision-function scores.
sample_reviews = sample_test_data['review_clean']
sample_test_matrix = vectorizer.transform(sample_reviews)
scores_sample = sentiment_model.decision_function(sample_test_matrix)
print(scores_sample)

### Mapping function based on scores

In [None]:
def check_score(input_array):
    """Map each score to a class label.

    Args:
        input_array: iterable of numeric scores.

    Returns:
        A list with one entry per score: 1 when the score is >= 0, -1 otherwise.
    """
    return [1 if score >= 0 else -1 for score in input_array]
# Display the +1/-1 labels that check_score derives from the sample decision scores.
check_score(scores_sample)

In [None]:
# Raw decision-function scores for the whole test set. The test reviews were
# already vectorised as `test_matrix`, so that matrix is reused here instead of
# transforming the same reviews a second time.
final_scores_data = test_matrix
final_score = sentiment_model.decision_function(final_scores_data)
print(final_score)

In [None]:
# One-column frame holding the decision score of every test review.
df = pd.DataFrame(final_score, columns=['Score'])
print(df.shape)
print(df.columns)
# Shape and columns of the test frame, for comparison with the score frame.
print(test_data.shape)
print(test_data.columns)

In [None]:
# Give the test frame a fresh 0..n-1 index (the old index is discarded) so its
# rows can be paired by position with the rows of the score frame.
# `df_test` refers to the same re-indexed frame as `test_data`.
test_data = test_data.reset_index(drop=True)
df_test = test_data

In [None]:
# Put each score next to its review. Column-wise concat pairs rows by index,
# so the test frame is given a 0..n-1 index to match the score frame.
df2 = pd.concat([df, test_data.reset_index(drop=True)], axis=1)
# Highest scores first.
df_export = df2.sort_values(by='Score', ascending=False)
df_export.head(5)

## Export data (Top & Bottom 20)

In [None]:
# First 20 rows of the score-sorted frame (the highest scores).
df_top = df_export.iloc[:20]
# Uncomment to write them to disk:
#df_top.to_csv('top20.csv')

# Last 20 rows of the score-sorted frame (the lowest scores).
df_bottom = df_export.iloc[-20:]
# Uncomment to write them to disk:
#df_bottom.to_csv('bottom20.csv')

## Calculate Accuracy

In [None]:
# Both frames must have the same number of rows for the accuracy computation.
print("Scores dataframe shape: " + str(df.shape))
print("Test data sentiment as reference shape: " + str(test_data.shape))

In [None]:
# Score the model on the vectorised test reviews against the test-set sentiment labels.
accuracy_sentiment_model = sentiment_model.score(final_scores_data, test_data['sentiment'].values)
print("Sentiment model accuracy:\n" + str(accuracy_sentiment_model * 100) + " percent")

## Special Classifier

In [None]:
# The 20 hand-picked words used as the vocabulary of the reduced classifier.
significant_words = [
    'love', 'great', 'easy', 'old', 'little',
    'perfect', 'loves', 'well', 'able', 'car',
    'broke', 'less', 'even', 'waste', 'disappointed',
    'work', 'product', 'money', 'would', 'return',
]

In [None]:
# Vectoriser restricted to the words in significant_words (20 words).
vectorizer_word_subset = CountVectorizer(vocabulary = significant_words)
# Count only those words in the training and test reviews.
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])
print("Train matrix special: " + str(train_matrix_word_subset.shape))
print("Test matrix special: " + str(test_matrix_word_subset.shape))

## Simple Log-Reg model

In [None]:
# Logistic regression trained on the counts of the selected words only.
simple_model = LogisticRegression(random_state=0).fit(train_matrix_word_subset, train_data['sentiment'])
# Despite its name, zero_elem_simple is the number of strictly positive coefficients.
zero_elem_simple = (simple_model.coef_ > 0).sum()
print(zero_elem_simple)

In [None]:
# Display the fitted coefficient array of simple_model, the model trained on the
# counts of the 20 words in significant_words.
# NOTE(review): presumably one coefficient per word, in the order of
# significant_words; confirm against vectorizer_word_subset's feature order.
simple_model.coef_