# Import libraries

In [10]:
import tensorflow as tf
import string
import re
import pandas as pd
import spacy
import nltk
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load data

In [11]:
df_train = pd.read_csv ("hm_train.csv")
df_train.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category
0,27673,24h,I went on a successful date with someone I fel...,1,affection
1,27674,24h,I was happy when my son got 90% marks in his e...,1,affection
2,27675,24h,I went to the gym this morning and did yoga.,1,exercise
3,27676,24h,We had a serious talk with some friends of our...,2,bonding
4,27677,24h,I went with grandchildren to butterfly display...,1,affection


In [12]:
df_test = pd.read_csv("hm_test.csv")
df_test.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence
0,88305,3m,I spent the weekend in Chicago with my friends.,1
1,88306,3m,We moved back into our house after a remodel. ...,2
2,88307,3m,My fiance proposed to me in front of my family...,1
3,88308,3m,I ate lobster at a fancy restaurant with some ...,1
4,88309,3m,I went out to a nice restaurant on a date with...,5


# Preprocessing

### 1.Cleaning

In [13]:
def clean(data):
    datarm=[]
    for i in data['cleaned_hm']:
        punct = re.sub(r'\W', ' ', i)
        non_num= re.sub ('[0-9]', '', punct)
        newlines = str(non_num).replace("\n","")
        returns = str(newlines).replace("\r","")
        quotes = str(returns).replace("'","")
        datarm.append(quotes)
    return datarm

### 2.Lemmatization

In [90]:
def lemmatize(data):
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    data_lemma = []
    for i in data:
        text = nlp(i)
        data_lemma.append([token.lemma_.lower().strip() for token in text])
    data_lemma =[' '.join(i) for i in data_lemma]
    return data_lemma

### 3.Stopwords

In [109]:
def remove_stopwords(data):
    stop_words = nltk.corpus.stopwords.words('english')
    stop_words.append('I')
    temp=[]
    for i in data:
        tokens=word_tokenize(i)
        stopped = [i for i in tokens if i not in stop_words]
        stopped = [''.join(i) for i in stopped]
        temp.append(stopped)
    temp =[' '.join(i) for i in temp]
    return temp

### Final preprocess function

In [114]:
def preprocess_data(data):
    cleaned=clean(data)
    data_cleaned=remove_stopwords(cleaned)
    lemmatized=lemmatize(data_cleaned)
    return lemmatized

In [115]:
X_train=preprocess_data(df_train)
X_train

['go successful date someone feel sympathy connection',
 'happy son get mark examination',
 'go gym morning yoga',
 'we serious talk friend flaky lately they understand good evening hang',
 'go grandchild butterfly display crohn conservatory',
 'meditate last night',
 'make new recipe peasant bread come spectacular',
 'get gift eld brother really surprising',
 'yesterday my moms birthday so enjoyed',
 'watch cupcake war three teen child',
 'come rd place call duty video game',
 'complete mile run without break it make feel strong',
 'go movie friend fun',
 'short gold make trade',
 'hear song it nearly impossible go angry happy look thought ease angry feel move direction happiness it may take long head positive direction youall world good',
 'my son perform well test preparation',
 'help neighbour fix car damage',
 'manage get final trophy game play',
 'a hot kiss girl friend last night make day',
 'my new bcaa come mail yay strawberry lemonade flavor aminos make heart happy',
 'got a 

In [93]:
X_test=preprocess_data(df_test)

In [94]:
y_train=df_train['predicted_category']

In [148]:
label_encoder = LabelEncoder()
y_encode = label_encoder.fit_transform(y_train)
y=pd.DataFrame(y_encode)

### Vectorizing

### Bag of words

In [117]:
Xtrain_clean=pd.DataFrame({'data': X_train})
bow = CountVectorizer()          
bow.fit(Xtrain_clean['data'])               
X_bow = bow.transform(Xtrain_clean['data']) 

### TF IDF

In [138]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(Xtrain_clean['data']) # vectorize feayures using tfidf

### Prediction accuracy function

In [118]:
def pred(X,y,model):

    X_train, X_test, y_train, y_test = train_test_split( X,
                                                         y,
                                                         test_size=0.1,
                                                         random_state=1) 


    model.fit(X_train, y_train) 
    model = model.predict(X_test) 
    return accuracy_score(y_test, model)

### KNeighborsClassifier

In [144]:
knn = KNeighborsClassifier() 
bow_accuracy=pred(X_bow,y,knn)

tfidf_accuracy=pred(X_tfidf,y,knn)


  return self._fit(X, y)
  return self._fit(X, y)


In [145]:
bow_accuracy

0.7410906679927067

In [146]:
tfidf_accuracy

0.7062821150339798

### Logistic Regression

In [153]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression() 
bow_accuracy=pred(X_bow,y,lr)
tfidf_accuracy=pred(X_tfidf,y,lr)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [154]:
bow_accuracy

0.892756505884303

In [155]:
tfidf_accuracy

0.8833084700812199

### Random Forest Classifier

In [167]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier()
bow_accuracy=pred(X_bow,y,forest)
tfidf_accuracy=pred(X_tfidf,y,forest)

  model.fit(X_train, y_train)
  model.fit(X_train, y_train)


In [168]:
bow_accuracy

0.8486656721365822

In [169]:
tfidf_accuracy

0.8519807724183657

### Accuracy after balancing data

In [157]:
from imblearn.over_sampling import SMOTE

oversampling = SMOTE(sampling_strategy='auto')
X_new, y_new = oversampling.fit_resample(X_tfidf, y) 
y_new.value_counts()

0    20880
1    20880
2    20880
3    20880
4    20880
5    20880
6    20880
dtype: int64

In [158]:
lraccuracy=pred(X_new, y_new,lr)
accuracy

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9432129173508483

In [160]:
knnaccuracy=pred(X_new, y_new, knn)
knnaccuracy

  return self._fit(X, y)


0.8293650793650794

In [170]:
rfaccuracy=pred(X_new, y_new, forest)
rfaccuracy

  model.fit(X_train, y_train)


0.9505336617405583