### "Amazon-Alexa" text classification using custom trained Word Embeddings ( w/ gensim )

#### https://www.kaggle.com/datasets/sid321axn/amazon-alexa-reviews

### import the required libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [None]:
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [None]:
import nltk
# Download the NLTK 'stopwords' corpus so that stopwords.words() below can read it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words held in a set; later cells test tokens for membership in it.
stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Load the input data ( "amazon alexa reviews data")

In [None]:
# Load the tab-separated reviews file into a DataFrame.
# The path is relative, so the file must be in the notebook's working directory.
df_amazon = pd.read_csv ("amazon_alexa.tsv", sep="\t")

In [None]:
# Preview the first 5 records to check the columns that were parsed
df_amazon.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [None]:
# Keep only the review text and the feedback label.
# .copy() makes df_amazon_mod an independent DataFrame rather than a slice of
# df_amazon, so the column assignments in later cells modify this frame directly
# and no longer raise SettingWithCopyWarning.
df_amazon_mod = df_amazon[['verified_reviews','feedback']].copy()
df_amazon_mod.head()

Unnamed: 0,verified_reviews,feedback
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


### Clean and tokenize the reviews (regex cleanup, whitespace split, NLTK stop-word removal)

In [None]:
import re
def clean(string):
    """Replace every run of non-letter characters with a single space.

    Args:
        string: The raw text. Non-string values are converted with ``str()``.
            ``None`` and float NaN (a missing cell) are treated as empty text.

    Returns:
        str: The text with each run of characters outside ``A-Z``/``a-z``
        collapsed to one space. Case is preserved, and any leading or trailing
        space produced by the substitution is not stripped.
    """
    # Without this guard, str() would turn a missing value into the literal
    # word "None" or "nan", which would flow downstream as a real token.
    if string is None or (isinstance(string, float) and string != string):
        return ''
    string = str(string)
    cleanString = re.sub('[^A-Za-z]+',' ', string )
    return cleanString

# Step 1: keep letters only (digits and punctuation become spaces).
df_amazon_mod['reviews_non_numeric'] = df_amazon_mod['verified_reviews'].map(clean)

# Step 2: discard words that are one or two characters long.
df_amazon_mod['reviews_len_trim'] = df_amazon_mod['reviews_non_numeric'].map(
    lambda text: ' '.join(word for word in text.split() if len(word) > 2)
)

# Step 3: lower-case the text and split it on whitespace into a token list.
df_amazon_mod['reviews_Tokenized'] = df_amazon_mod['reviews_len_trim'].map(
    lambda text: text.lower().split()
)

# Step 4: remove English stop words; the result stays a list of tokens.
df_amazon_mod['reviews_Tokenized_stop'] = df_amazon_mod['reviews_Tokenized'].map(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_mod['reviews_non_numeric'] = df_amazon_mod['verified_reviews'].map(clean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_mod['reviews_len_trim'] = df_amazon_mod['reviews_non_numeric'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))


In [None]:
# Show the frame with every intermediate preprocessing column (pandas elides the middle rows)
df_amazon_mod

Unnamed: 0,verified_reviews,feedback,reviews_non_numeric,reviews_len_trim,reviews_Tokenized,reviews_Tokenized_stop
0,Love my Echo!,1,Love my Echo,Love Echo,"[love, echo]","[love, echo]"
1,Loved it!,1,Loved it,Loved,[loved],[loved]
2,"Sometimes while playing a game, you can answer...",1,Sometimes while playing a game you can answer ...,Sometimes while playing game you can answer qu...,"[sometimes, while, playing, game, you, can, an...","[sometimes, playing, game, answer, question, c..."
3,I have had a lot of fun with this thing. My 4 ...,1,I have had a lot of fun with this thing My yr ...,have had lot fun with this thing old learns ab...,"[have, had, lot, fun, with, this, thing, old, ...","[lot, fun, thing, old, learns, dinosaurs, cont..."
4,Music,1,Music,Music,[music],[music]
...,...,...,...,...,...,...
3145,"Perfect for kids, adults and everyone in betwe...",1,Perfect for kids adults and everyone in between,Perfect for kids adults and everyone between,"[perfect, for, kids, adults, and, everyone, be...","[perfect, kids, adults, everyone]"
3146,"Listening to music, searching locations, check...",1,Listening to music searching locations checkin...,Listening music searching locations checking t...,"[listening, music, searching, locations, check...","[listening, music, searching, locations, check..."
3147,"I do love these things, i have them running my...",1,I do love these things i have them running my ...,love these things have them running entire hom...,"[love, these, things, have, them, running, ent...","[love, things, running, entire, home, lights, ..."
3148,Only complaint I have is that the sound qualit...,1,Only complaint I have is that the sound qualit...,Only complaint have that the sound quality isn...,"[only, complaint, have, that, the, sound, qual...","[complaint, sound, quality, great, mostly, use..."


In [None]:
# Train a Word2Vec model on the stop-word-filtered token lists.
gensim_custom_model = Word2Vec(
    sentences=df_amazon_mod['reviews_Tokenized_stop'],
    vector_size=50,  # dimensionality of each word embedding
    window=5,        # context window for words during training
    min_count=1,     # words appearing fewer than min_count times are ignored
    workers=1,       # number of workers used for training
    epochs=30,       # number of training passes over the corpus
)
# One-line summary of the trained model.
print(gensim_custom_model)

# Vocabulary learned by the model.
words = gensim_custom_model.wv.index_to_key

# Save the model to disk, then load it back and print the reloaded copy's summary.
gensim_custom_model.save('gensim_embed_model.bin')
new_model = Word2Vec.load('gensim_embed_model.bin')
print(new_model)

Word2Vec<vocab=3766, vector_size=50, alpha=0.025>
Word2Vec<vocab=3766, vector_size=50, alpha=0.025>


In [None]:
# Look up the embedding of one vocabulary word, then show its length and its values.
playing_vector = new_model.wv['playing']
print(len(playing_vector))
print(playing_vector)

50
[ 1.1390572   0.69367504  0.35405543 -0.3360351  -1.6156601  -0.67571414
  0.0082701   1.3697423  -1.241904    0.4051194   0.43244246 -0.4759944
 -0.5358678  -0.25197065 -0.8354415  -0.5328631  -0.45839345 -0.4699805
 -1.864845    1.1357853  -0.04019342  0.56394374  3.1011941   0.22915293
  1.6797787  -0.5109043   0.5957811  -1.122258   -0.31785893  1.4147567
  0.2542143  -0.51969314 -0.90367746 -0.97077984  0.69434416  0.7134593
 -0.94251066  0.65927553 -0.77655417 -0.7325376  -0.5880368   0.64481336
 -0.05699458  0.03114527  1.572525    0.8537959   1.1929244  -0.15653615
 -0.933787   -0.4912228 ]


In [None]:
# Display the vocabulary words whose vectors are most similar to 'playing', with their similarity scores
new_model.wv.most_similar('playing')

[('personally', 0.8814302086830139),
 ('listening', 0.86029052734375),
 ('timers', 0.8563851714134216),
 ('photos', 0.8408114910125732),
 ('pandora', 0.8273612856864929),
 ('play', 0.8252195119857788),
 ('videos', 0.824177622795105),
 ('lyrics', 0.8007113337516785),
 ('books', 0.7935694456100464),
 ('duty', 0.7889806628227234)]

### Aggregated sentence vector for each review, computed from its word vectors

In [None]:
# Model vocabulary as a set, used by the next cell to skip tokens the model has no vector for.
words = set(new_model.wv.index_to_key)

In [None]:
# For every review, stack the embeddings of its in-vocabulary tokens into one array
# (one row per token). A review with no known tokens yields an empty array.
#
# The per-review arrays are written one by one into a pre-allocated 1-D object array.
# np.array(list_of_arrays, dtype=object) would instead merge them into a single
# multi-dimensional array whenever every review has the same number of tokens,
# and such an array cannot be assigned to a DataFrame column.
token_lists = df_amazon_mod['reviews_Tokenized_stop']
review_vectors = np.empty(len(token_lists), dtype=object)
for pos, tokens in enumerate(token_lists):
    review_vectors[pos] = np.array([new_model.wv[tok] for tok in tokens if tok in words])
df_amazon_mod['reviews_vect'] = review_vectors

In [None]:
# Inspect the new 'reviews_vect' column on the first rows
df_amazon_mod.head()

Unnamed: 0,verified_reviews,feedback,reviews_non_numeric,reviews_len_trim,reviews_Tokenized,reviews_Tokenized_stop,reviews_vect
0,Love my Echo!,1,Love my Echo,Love Echo,"[love, echo]","[love, echo]","[[0.65777403, -0.33503002, -0.2571375, 0.51623..."
1,Loved it!,1,Loved it,Loved,[loved],[loved],"[[0.09256516, -0.5539267, -0.28128806, 0.23581..."
2,"Sometimes while playing a game, you can answer...",1,Sometimes while playing a game you can answer ...,Sometimes while playing game you can answer qu...,"[sometimes, while, playing, game, you, can, an...","[sometimes, playing, game, answer, question, c...","[[0.87465954, -0.23909982, 0.07640512, 0.05844..."
3,I have had a lot of fun with this thing. My 4 ...,1,I have had a lot of fun with this thing My yr ...,have had lot fun with this thing old learns ab...,"[have, had, lot, fun, with, this, thing, old, ...","[lot, fun, thing, old, learns, dinosaurs, cont...","[[0.7744475, 0.3352178, -0.59644914, -1.099561..."
4,Music,1,Music,Music,[music],[music],"[[1.9074738, 1.5858399, -0.42845234, -0.911543..."


In [None]:
# Embedding dimensionality, read from the trained model instead of a repeated literal,
# so the zero-vector fallback below always has the same length as the averaged vectors.
vector_size_n_w2v = new_model.wv.vector_size

# One fixed-length vector per review: the mean of its token embeddings, or an
# all-zero vector when the review has no in-vocabulary tokens (empty array).
text_vect_avg = [
    v.mean(axis=0) if v.size else np.zeros(vector_size_n_w2v, dtype=float)
    for v in df_amazon_mod['reviews_vect']
]

df_amazon_mod['reviews_vect_avg'] = text_vect_avg
df_amazon_mod.head()

Unnamed: 0,verified_reviews,feedback,reviews_non_numeric,reviews_len_trim,reviews_Tokenized,reviews_Tokenized_stop,reviews_vect,reviews_vect_avg
0,Love my Echo!,1,Love my Echo,Love Echo,"[love, echo]","[love, echo]","[[0.65777403, -0.33503002, -0.2571375, 0.51623...","[0.18344745, 0.0499101, -0.68512803, 0.3177009..."
1,Loved it!,1,Loved it,Loved,[loved],[loved],"[[0.09256516, -0.5539267, -0.28128806, 0.23581...","[0.09256516, -0.5539267, -0.28128806, 0.235816..."
2,"Sometimes while playing a game, you can answer...",1,Sometimes while playing a game you can answer ...,Sometimes while playing game you can answer qu...,"[sometimes, while, playing, game, you, can, an...","[sometimes, playing, game, answer, question, c...","[[0.87465954, -0.23909982, 0.07640512, 0.05844...","[0.46043295, 0.08474762, -0.35956123, 0.506331..."
3,I have had a lot of fun with this thing. My 4 ...,1,I have had a lot of fun with this thing My yr ...,have had lot fun with this thing old learns ab...,"[have, had, lot, fun, with, this, thing, old, ...","[lot, fun, thing, old, learns, dinosaurs, cont...","[[0.7744475, 0.3352178, -0.59644914, -1.099561...","[0.5424106, 0.34815583, -0.09817438, 0.1525538..."
4,Music,1,Music,Music,[music],[music],"[[1.9074738, 1.5858399, -0.42845234, -0.911543...","[1.9074738, 1.5858399, -0.42845234, -0.9115436..."


In [None]:
# Expand the averaged vectors into a feature table: one row per review, one column per embedding dimension.
df_Machine_Learning = pd.DataFrame(text_vect_avg)
df_Machine_Learning

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.183447,0.049910,-0.685128,0.317701,0.674429,-0.439996,0.674290,0.324819,-0.321528,0.153714,...,0.696998,0.518812,-0.114208,0.168091,2.142874,0.179845,-1.024108,-0.676871,-0.206912,0.783130
1,0.092565,-0.553927,-0.281288,0.235816,0.878686,-0.235642,0.812565,0.809152,-0.622868,-0.206145,...,1.282025,-0.029557,-0.389590,0.300778,0.864906,0.287106,-0.504666,-0.466373,0.768607,0.347241
2,0.460433,0.084748,-0.359561,0.506332,-0.395053,-0.982635,0.617807,1.076343,-0.881969,0.527063,...,0.385097,0.329379,0.014070,-0.258172,0.844281,0.114003,0.378949,0.051351,0.299294,0.020001
3,0.542411,0.348156,-0.098174,0.152554,-0.358861,-0.679937,0.413263,0.975243,-1.076365,0.091955,...,0.153948,0.357252,-0.008301,-0.060838,0.995074,0.185298,0.529442,-0.345396,0.119140,-0.216673
4,1.907474,1.585840,-0.428452,-0.911544,-0.024541,-0.315669,-0.337547,1.484706,-3.001509,2.006122,...,-1.107329,0.046144,0.300611,1.068219,1.293279,0.116305,1.022116,-0.868205,-0.537958,-1.106072
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,0.126783,-0.114448,-0.208630,0.268718,0.056351,-0.378456,0.186587,0.845294,-0.534137,0.048709,...,0.678127,0.036600,-0.169014,0.157022,0.770087,0.318363,0.008067,-0.311125,0.322327,0.100904
3146,0.775892,0.653427,0.264275,0.124402,-0.788117,-0.375994,0.558324,0.438883,-1.419436,0.130209,...,0.093783,0.530963,0.075610,-0.400870,1.384609,-0.408064,0.408309,-0.438706,0.003314,0.017075
3147,0.019368,0.374065,-0.342401,0.562537,-0.145342,-0.852943,0.238624,0.756191,-0.755674,-0.036188,...,0.307533,0.124421,-0.033778,0.067836,0.899150,0.095463,0.156139,-0.039287,0.465922,0.212604
3148,0.161429,-0.055128,-0.275274,-0.023689,0.407176,-0.525599,0.387760,0.926500,-0.572069,-0.217996,...,0.119862,0.012803,-0.058570,0.111109,1.009488,0.439674,-0.178697,-0.445856,0.418319,-0.106589


In [None]:
# Give the feature columns 1-based names: Col_1 ... Col_N.
n_features = df_Machine_Learning.shape[1]
df_Machine_Learning.columns = [f'Col_{k}' for k in range(1, n_features + 1)]
df_Machine_Learning

Unnamed: 0,Col_1,Col_2,Col_3,Col_4,Col_5,Col_6,Col_7,Col_8,Col_9,Col_10,...,Col_41,Col_42,Col_43,Col_44,Col_45,Col_46,Col_47,Col_48,Col_49,Col_50
0,0.183447,0.049910,-0.685128,0.317701,0.674429,-0.439996,0.674290,0.324819,-0.321528,0.153714,...,0.696998,0.518812,-0.114208,0.168091,2.142874,0.179845,-1.024108,-0.676871,-0.206912,0.783130
1,0.092565,-0.553927,-0.281288,0.235816,0.878686,-0.235642,0.812565,0.809152,-0.622868,-0.206145,...,1.282025,-0.029557,-0.389590,0.300778,0.864906,0.287106,-0.504666,-0.466373,0.768607,0.347241
2,0.460433,0.084748,-0.359561,0.506332,-0.395053,-0.982635,0.617807,1.076343,-0.881969,0.527063,...,0.385097,0.329379,0.014070,-0.258172,0.844281,0.114003,0.378949,0.051351,0.299294,0.020001
3,0.542411,0.348156,-0.098174,0.152554,-0.358861,-0.679937,0.413263,0.975243,-1.076365,0.091955,...,0.153948,0.357252,-0.008301,-0.060838,0.995074,0.185298,0.529442,-0.345396,0.119140,-0.216673
4,1.907474,1.585840,-0.428452,-0.911544,-0.024541,-0.315669,-0.337547,1.484706,-3.001509,2.006122,...,-1.107329,0.046144,0.300611,1.068219,1.293279,0.116305,1.022116,-0.868205,-0.537958,-1.106072
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,0.126783,-0.114448,-0.208630,0.268718,0.056351,-0.378456,0.186587,0.845294,-0.534137,0.048709,...,0.678127,0.036600,-0.169014,0.157022,0.770087,0.318363,0.008067,-0.311125,0.322327,0.100904
3146,0.775892,0.653427,0.264275,0.124402,-0.788117,-0.375994,0.558324,0.438883,-1.419436,0.130209,...,0.093783,0.530963,0.075610,-0.400870,1.384609,-0.408064,0.408309,-0.438706,0.003314,0.017075
3147,0.019368,0.374065,-0.342401,0.562537,-0.145342,-0.852943,0.238624,0.756191,-0.755674,-0.036188,...,0.307533,0.124421,-0.033778,0.067836,0.899150,0.095463,0.156139,-0.039287,0.465922,0.212604
3148,0.161429,-0.055128,-0.275274,-0.023689,0.407176,-0.525599,0.387760,0.926500,-0.572069,-0.217996,...,0.119862,0.012803,-0.058570,0.111109,1.009488,0.439674,-0.178697,-0.445856,0.418319,-0.106589


In [None]:
# Place the original text and label next to the embedding features.
# A column-wise concat (axis=1) matches rows by index label.
final_df = pd.concat([df_amazon_mod[['verified_reviews','feedback']], df_Machine_Learning], axis=1, sort=False)
final_df

Unnamed: 0,verified_reviews,feedback,Col_1,Col_2,Col_3,Col_4,Col_5,Col_6,Col_7,Col_8,...,Col_41,Col_42,Col_43,Col_44,Col_45,Col_46,Col_47,Col_48,Col_49,Col_50
0,Love my Echo!,1,0.183447,0.049910,-0.685128,0.317701,0.674429,-0.439996,0.674290,0.324819,...,0.696998,0.518812,-0.114208,0.168091,2.142874,0.179845,-1.024108,-0.676871,-0.206912,0.783130
1,Loved it!,1,0.092565,-0.553927,-0.281288,0.235816,0.878686,-0.235642,0.812565,0.809152,...,1.282025,-0.029557,-0.389590,0.300778,0.864906,0.287106,-0.504666,-0.466373,0.768607,0.347241
2,"Sometimes while playing a game, you can answer...",1,0.460433,0.084748,-0.359561,0.506332,-0.395053,-0.982635,0.617807,1.076343,...,0.385097,0.329379,0.014070,-0.258172,0.844281,0.114003,0.378949,0.051351,0.299294,0.020001
3,I have had a lot of fun with this thing. My 4 ...,1,0.542411,0.348156,-0.098174,0.152554,-0.358861,-0.679937,0.413263,0.975243,...,0.153948,0.357252,-0.008301,-0.060838,0.995074,0.185298,0.529442,-0.345396,0.119140,-0.216673
4,Music,1,1.907474,1.585840,-0.428452,-0.911544,-0.024541,-0.315669,-0.337547,1.484706,...,-1.107329,0.046144,0.300611,1.068219,1.293279,0.116305,1.022116,-0.868205,-0.537958,-1.106072
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,"Perfect for kids, adults and everyone in betwe...",1,0.126783,-0.114448,-0.208630,0.268718,0.056351,-0.378456,0.186587,0.845294,...,0.678127,0.036600,-0.169014,0.157022,0.770087,0.318363,0.008067,-0.311125,0.322327,0.100904
3146,"Listening to music, searching locations, check...",1,0.775892,0.653427,0.264275,0.124402,-0.788117,-0.375994,0.558324,0.438883,...,0.093783,0.530963,0.075610,-0.400870,1.384609,-0.408064,0.408309,-0.438706,0.003314,0.017075
3147,"I do love these things, i have them running my...",1,0.019368,0.374065,-0.342401,0.562537,-0.145342,-0.852943,0.238624,0.756191,...,0.307533,0.124421,-0.033778,0.067836,0.899150,0.095463,0.156139,-0.039287,0.465922,0.212604
3148,Only complaint I have is that the sound qualit...,1,0.161429,-0.055128,-0.275274,-0.023689,0.407176,-0.525599,0.387760,0.926500,...,0.119862,0.012803,-0.058570,0.111109,1.009488,0.439674,-0.178697,-0.445856,0.418319,-0.106589


In [None]:
# Logistic Regression Classifier
# Fit on the averaged-embedding feature columns, with 'feedback' as the target.
# NOTE(review): the model is fit on every row; no held-out split or evaluation metric
# is visible in this notebook, so its accuracy is not measured here.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(df_Machine_Learning, final_df['feedback'])

In [None]:
import pickle as pk
# Persist the fitted classifier. The context manager closes the file handle as soon
# as the dump finishes, so the pickle is fully written before it is read back later.
with open('clf_model.pkl', 'wb') as model_file:
    pk.dump(classifier, model_file)

In [None]:
# Reload the Word2Vec model that was saved to disk earlier.
w2v_model_reloaded = Word2Vec.load("gensim_embed_model.bin")
# Take the embedding size from the loaded model rather than repeating a literal, so the
# zero-vector fallback used at prediction time cannot drift from the model's dimensionality.
vector_size_n_reloaded = w2v_model_reloaded.wv.vector_size

print(w2v_model_reloaded)
print(vector_size_n_reloaded)

Word2Vec<vocab=3766, vector_size=50, alpha=0.025>
50


### Predictions on New Test data

In [None]:
# Example sentences to score with the trained embedding + classifier pipeline.
new_input = [
    "Flowers I like to see in the park especially sunflowers",
    "I like flowers",
]

# Echo each sentence on its own line.
for sentence in new_input:
    print(sentence)

Flowers I like to see in the park especially sunflowers
I like flowers


In [None]:
# Wrap the sentences in a single-column DataFrame so they can go through the same column-wise steps as the reviews.
new_input_df = pd.DataFrame({'New_Input': new_input})
new_input_df

Unnamed: 0,New_Input
0,Flowers I like to see in the park especially s...
1,I like flowers


### Data processing

In [None]:
import re
def clean(string):
    """Replace every run of non-letter characters with a single space.

    Applies the same cleaning rule that was used on the training reviews.

    Args:
        string: The raw text. Non-string values are converted with ``str()``.
            ``None`` and float NaN (a missing cell) are treated as empty text.

    Returns:
        str: The text with each run of characters outside ``A-Z``/``a-z``
        collapsed to one space. Case is preserved, and any leading or trailing
        space produced by the substitution is not stripped.
    """
    # Without this guard, str() would turn a missing value into the literal
    # word "None" or "nan", which would flow downstream as a real token.
    if string is None or (isinstance(string, float) and string != string):
        return ''
    string = str(string)
    cleanString = re.sub('[^A-Za-z]+',' ', string )
    return cleanString

# Step 1: keep letters only (digits and punctuation become spaces).
new_input_df['reviews_non_numeric'] = new_input_df['New_Input'].map(clean)

# Step 2: discard words that are one or two characters long.
new_input_df['reviews_len_trim'] = new_input_df['reviews_non_numeric'].map(
    lambda text: ' '.join(word for word in text.split() if len(word) > 2)
)

# Step 3: lower-case the text and split it on whitespace into a token list.
new_input_df['reviews_Tokenized'] = new_input_df['reviews_len_trim'].map(
    lambda text: text.lower().split()
)

# Step 4: remove English stop words; the result stays a list of tokens.
new_input_df['reviews_Tokenized_stop'] = new_input_df['reviews_Tokenized'].map(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
# Show the new sentences alongside each intermediate preprocessing column
new_input_df

Unnamed: 0,New_Input,reviews_non_numeric,reviews_len_trim,reviews_Tokenized,reviews_Tokenized_stop
0,Flowers I like to see in the park especially s...,Flowers I like to see in the park especially s...,Flowers like see the park especially sunflowers,"[flowers, like, see, the, park, especially, su...","[flowers, like, see, park, especially, sunflow..."
1,I like flowers,I like flowers,like flowers,"[like, flowers]","[like, flowers]"


In [None]:
# Vocabulary of the reloaded model, as a set, used to skip tokens it has no vector for.
words = set(w2v_model_reloaded.wv.index_to_key )

# For every sentence, stack the embeddings of its in-vocabulary tokens into one array
# (one row per token). A sentence with no known tokens yields an empty array.
#
# The per-sentence arrays are written one by one into a pre-allocated 1-D object array.
# np.array(list_of_arrays, dtype=object) would instead merge them into a single
# multi-dimensional array whenever all sentences have the same number of known tokens
# (always the case for a single input sentence), and the column assignment would fail.
token_lists = new_input_df['reviews_Tokenized_stop']
input_vectors = np.empty(len(token_lists), dtype=object)
for pos, tokens in enumerate(token_lists):
    input_vectors[pos] = np.array([w2v_model_reloaded.wv[tok] for tok in tokens if tok in words])
new_input_df['New_Input_vect'] = input_vectors

new_input_df

Unnamed: 0,New_Input,reviews_non_numeric,reviews_len_trim,reviews_Tokenized,reviews_Tokenized_stop,New_Input_vect
0,Flowers I like to see in the park especially s...,Flowers I like to see in the park especially s...,Flowers like see the park especially sunflowers,"[flowers, like, see, the, park, especially, su...","[flowers, like, see, park, especially, sunflow...","[[0.75755143, -0.06675574, -1.1178535, -0.0252..."
1,I like flowers,I like flowers,like flowers,"[like, flowers]","[like, flowers]","[[0.75755143, -0.06675574, -1.1178535, -0.0252..."


In [None]:
# Collapse each sentence's token embeddings into one fixed-length vector:
# the column-wise mean when at least one token was found, otherwise all zeros.
# The zero vector must use the same vector size as model training.
text_vect_avg = [
    token_matrix.mean(axis=0)
    if token_matrix.size
    else np.zeros(vector_size_n_reloaded, dtype=float)
    for token_matrix in new_input_df['New_Input_vect']
]

new_input_df['New_Input_vect_avg'] = text_vect_avg
new_input_df

Unnamed: 0,New_Input,reviews_non_numeric,reviews_len_trim,reviews_Tokenized,reviews_Tokenized_stop,New_Input_vect,New_Input_vect_avg
0,Flowers I like to see in the park especially s...,Flowers I like to see in the park especially s...,Flowers like see the park especially sunflowers,"[flowers, like, see, the, park, especially, su...","[flowers, like, see, park, especially, sunflow...","[[0.75755143, -0.06675574, -1.1178535, -0.0252...","[0.4494255, 0.0013823854, -0.40411243, 0.10564..."
1,I like flowers,I like flowers,like flowers,"[like, flowers]","[like, flowers]","[[0.75755143, -0.06675574, -1.1178535, -0.0252...","[0.75755143, -0.06675574, -1.1178535, -0.02525..."


In [None]:
# Feature table for the new sentences, with the same Col_1 ... Col_N naming as the training features.
new_input_Machine_Learning_df = pd.DataFrame(text_vect_avg)
n_input_features = new_input_Machine_Learning_df.shape[1]
new_input_Machine_Learning_df.columns = [f'Col_{k}' for k in range(1, n_input_features + 1)]
new_input_Machine_Learning_df

Unnamed: 0,Col_1,Col_2,Col_3,Col_4,Col_5,Col_6,Col_7,Col_8,Col_9,Col_10,...,Col_41,Col_42,Col_43,Col_44,Col_45,Col_46,Col_47,Col_48,Col_49,Col_50
0,0.449425,0.001382,-0.404112,0.105644,-0.373763,-0.472068,0.471721,0.592776,-0.960902,0.193734,...,-0.097839,0.178007,-0.15026,0.040186,0.549433,0.448335,0.336928,-0.335155,0.148022,-0.015995
1,0.757551,-0.066756,-1.117854,-0.025259,0.26145,-0.912208,0.571991,0.264955,-0.907752,-0.098239,...,-0.620555,0.864116,0.13752,-0.628806,0.7441,0.689251,0.553172,-0.06253,0.392097,-0.019398


In [None]:
# Place each input sentence next to its embedding features (column-wise concat, rows matched by index label).
final_new_input_df = pd.concat([new_input_df[['New_Input']], new_input_Machine_Learning_df], axis=1, sort=False)
final_new_input_df

Unnamed: 0,New_Input,Col_1,Col_2,Col_3,Col_4,Col_5,Col_6,Col_7,Col_8,Col_9,...,Col_41,Col_42,Col_43,Col_44,Col_45,Col_46,Col_47,Col_48,Col_49,Col_50
0,Flowers I like to see in the park especially s...,0.449425,0.001382,-0.404112,0.105644,-0.373763,-0.472068,0.471721,0.592776,-0.960902,...,-0.097839,0.178007,-0.15026,0.040186,0.549433,0.448335,0.336928,-0.335155,0.148022,-0.015995
1,I like flowers,0.757551,-0.066756,-1.117854,-0.025259,0.26145,-0.912208,0.571991,0.264955,-0.907752,...,-0.620555,0.864116,0.13752,-0.628806,0.7441,0.689251,0.553172,-0.06253,0.392097,-0.019398


In [None]:
# Reload the classifier that was pickled earlier in this notebook.
# NOTE: unpickling can execute code embedded in the file, so only load pickles from a trusted source.
# The context manager closes the file handle once loading is done.
with open("clf_model.pkl", 'rb') as model_file:
    clf_reloaded = pk.load(model_file)

# Predict the feedback label for each new sentence and attach it to the display frame.
y_pred = clf_reloaded.predict(new_input_Machine_Learning_df)
final_new_input_df['Prediction'] = y_pred
final_new_input_df

Unnamed: 0,New_Input,Col_1,Col_2,Col_3,Col_4,Col_5,Col_6,Col_7,Col_8,Col_9,...,Col_42,Col_43,Col_44,Col_45,Col_46,Col_47,Col_48,Col_49,Col_50,Prediction
0,Flowers I like to see in the park especially s...,0.449425,0.001382,-0.404112,0.105644,-0.373763,-0.472068,0.471721,0.592776,-0.960902,...,0.178007,-0.15026,0.040186,0.549433,0.448335,0.336928,-0.335155,0.148022,-0.015995,1
1,I like flowers,0.757551,-0.066756,-1.117854,-0.025259,0.26145,-0.912208,0.571991,0.264955,-0.907752,...,0.864116,0.13752,-0.628806,0.7441,0.689251,0.553172,-0.06253,0.392097,-0.019398,1


In [None]:
# Compact view: just the input sentence and its predicted label
final_new_input_df[['New_Input','Prediction']]

Unnamed: 0,New_Input,Prediction
0,Flowers I like to see in the park especially s...,1
1,I like flowers,1
