In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
df = pd.read_csv("/content/DemoProfiles.csv")

In [4]:
df.head()

Unnamed: 0,first_name,last_name,company,position,industry,location
0,John,Doe,ABC Corp,Marketing Manager,Technology,San Francisco
1,Jane,Smith,XYZ Inc,Social Media Specialist,Advertising & Marketing,New York
2,Michael,Johnson,123 Company,Digital Marketing Analyst,Consulting,Chicago
3,Sarah,Williams,ABC Corp,Content Writer,Media & Publishing,London
4,David,Brown,XYZ Inc,Brand Manager,Consumer Goods,Miami


In [5]:
def preprocess(text):

  text = text.lower()

  text = re.sub(r"http\S+", "", text)

  text = re.sub(r"www\S+", "", text)

  text = re.sub(r"[^a-zA-Z0-9]", " ", text)

  text = re.sub(r"[^\w\s]", " ",text)

  tokens = word_tokenize(text)

  stop_words = stopwords.words("english")

  filtered_tokens = [token for token in tokens if token not in stop_words]

  stemmer = PorterStemmer()

  stemmed_tokens =  [stemmer.stem(token) for token in filtered_tokens]

  preprocessed_text = " ".join(stemmed_tokens)

  return preprocessed_text


In [6]:
import nltk
nltk.download('punkt_tab')
df['position_new'] = df['position'].apply(preprocess)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [7]:
df.head()

Unnamed: 0,first_name,last_name,company,position,industry,location,position_new
0,John,Doe,ABC Corp,Marketing Manager,Technology,San Francisco,market manag
1,Jane,Smith,XYZ Inc,Social Media Specialist,Advertising & Marketing,New York,social media specialist
2,Michael,Johnson,123 Company,Digital Marketing Analyst,Consulting,Chicago,digit market analyst
3,Sarah,Williams,ABC Corp,Content Writer,Media & Publishing,London,content writer
4,David,Brown,XYZ Inc,Brand Manager,Consumer Goods,Miami,brand manag


In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['position_new'])

In [9]:
tokenized_texts = [tokenizer.texts_to_sequences([text])[0] for text in df['position_new'] ]

In [10]:
ma = max([len(tokenizedText) for tokenizedText in tokenized_texts ])

In [11]:
processedSequences = pad_sequences(tokenized_texts, maxlen=ma, padding='post')

In [12]:
from sklearn.preprocessing import normalize
normalized_sequences = normalize(processedSequences, norm='l2', axis=1)

In [13]:
unique_labels = df['industry'].unique()

In [14]:
unique_labels

array(['Technology', 'Advertising & Marketing', 'Consulting',
       'Media & Publishing', 'Consumer Goods', 'E-commerce',
       'Fashion & Apparel', 'Beauty & Cosmetics', 'Market Research',
       ' Marketing Coordinator'], dtype=object)

In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
encoder = LabelEncoder()

In [17]:
labels = encoder.fit_transform(df["industry"])

In [18]:
onehotlabels = tf.keras.utils.to_categorical(labels)

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
x_train, x_test, y_train, y_test = train_test_split(normalized_sequences, onehotlabels, test_size=0.2, random_state=42)

In [74]:
model = tf.keras.Sequential()

In [75]:
noofwords = len(tokenizer.word_index)+1

In [76]:
model.add(tf.keras.layers.Embedding(noofwords, 100, input_length=ma))

In [77]:
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16, return_sequences=True)))

In [78]:
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(4, return_sequences=True)))

In [79]:
model.add(tf.keras.layers.Flatten())

In [80]:
model.add(tf.keras.layers.Dense(max(labels)+1, activation='softmax'))

In [81]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [82]:
model.fit(x_train, y_train, epochs=100)

Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 19ms/step - accuracy: 0.1469 - loss: 2.3012
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.1391 - loss: 2.2939 
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.1547 - loss: 2.2853
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.1273 - loss: 2.2780
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.1195 - loss: 2.2759
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.1117 - loss: 2.2687
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.1312 - loss: 2.2556
Epoch 8/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.1273 - loss: 2.2437
Epoch 9/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x7f88e40b7c80>

In [56]:
model.evaluate(x_test, y_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 933ms/step - accuracy: 0.1000 - loss: 2.2272


[2.2271807193756104, 0.10000000149011612]