In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, LinearSVC
import tensorflow as tf
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
import os
import numba as nb
import spacy
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

In [None]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook uses *before* their first use, so the
# cell also works on a machine with an empty nltk_data directory:
#   - 'wordnet'                     -> WordNetLemmatizer
#   - 'punkt'                       -> nltk.word_tokenize
#   - 'averaged_perceptron_tagger'  -> nltk.pos_tag
# NOTE(review): newer NLTK releases may look for differently named packages
# (e.g. 'punkt_tab', 'averaged_perceptron_tagger_eng') - confirm against the
# installed version.
for resource in ('wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Quick smoke test of the lemmatizer (default part of speech).
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("does"))

### import data

In [None]:
# Load the tweet CSV. Column names are passed explicitly via `names`, and the
# file is decoded as ISO-8859-1.
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=['target', 'id', 'date', 'flag', 'user', 'text'],
    encoding='ISO-8859-1',
)

In [None]:
# Preview the first 10 rows of the freshly loaded frame.
df.head(10)

In [None]:
# Show the raw text of the row labelled 0 (the first row under the default index).
print(df.text[0])

In [None]:
# Map the raw target values onto consecutive integer class ids. The fitted
# encoder stays available as `le` for later inverse lookups.
le = preprocessing.LabelEncoder().fit(df.target)
df.target = le.transform(df.target)
df.head(10)

In [None]:
# Pull texts and encoded labels out as arrays, then hold out 15% for testing.
# The fixed random_state keeps the split repeatable.
x, y = df.text.values, df.target.values
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=32, test_size=0.15
)

In [None]:
# Spot-check one training text (position 174).
print(x_train[174])

In [None]:
# Show the first training text.
print(x_train[0])

import re

# Replace every run of non-letter characters (digits, punctuation, whitespace,
# non-ASCII symbols) with a single space. Both splits are overwritten in place.
non_letters = re.compile(r'[^A-Za-z]+')
for texts in (x_train, x_test):
    for i in range(texts.shape[0]):
        texts[i] = non_letters.sub(' ', texts[i])

In [None]:

def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter:
    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.

    Args:
        nltk_tag: POS tag string as produced by ``nltk.pos_tag``.

    Returns:
        The corresponding ``wordnet`` constant, or ``None`` when the tag
        starts with none of the letters above.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            # Looked up only on a match, so unmatched tags never touch wordnet.
            return getattr(wordnet, attr)
    return None

def lemmatize_data(data):
    """Lemmatize every sentence in ``data`` using POS-aware WordNet lookups.

    Each sentence is tokenized and POS-tagged with NLTK. Tokens whose tag maps
    to a WordNet part of speech are lemmatized with that part of speech; all
    other tokens are kept unchanged.

    Args:
        data: Iterable of sentence strings (NumPy array, list, pandas Series, ...).

    Returns:
        list[str]: One space-joined lemmatized sentence per input item, in
        input order. An empty input yields an empty list.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized_data = []
    # Iterate over the items directly instead of indexing through
    # ``data.shape[0]``, so any iterable of strings is accepted, not only
    # NumPy arrays.
    for sentence in data:
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        lemmatized_sentence = []
        for word, nltk_tag in tagged_tokens:
            tag = nltk_tag_to_wordnet_tag(nltk_tag)
            if tag is None:
                # No WordNet equivalent for this tag: keep the token as is.
                lemmatized_sentence.append(word)
            else:
                lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
        lemmatized_data.append(" ".join(lemmatized_sentence))
    return lemmatized_data

In [None]:
# Lemmatize the training split first, then the test split.
x_train_lm, x_test_lm = map(lemmatize_data, (x_train, x_test))

In [None]:
# Fit the TF-IDF vocabulary and weights on the training texts only, then apply
# the same fitted vectorizer to both splits.
vectorizer = TfidfVectorizer().fit(x_train_lm)
x_train_td, x_test_td = (
    vectorizer.transform(texts) for texts in (x_train_lm, x_test_lm)
)

In [None]:
# Row indices of the TF-IDF training matrix. Despite the name, `rand_arr` is
# not shuffled, so the slice below keeps the first 3000 training rows.
rand_arr = np.arange(x_train_td.shape[0])
data_to_svd = x_train_td[rand_arr[:3000]]

In [None]:
# Confirm the size of the subsample selected for dimensionality reduction.
print(data_to_svd.shape)

In [None]:
# RBF-kernel SVM on the TF-IDF features; the solver is capped at 2000 iterations.
classifier = SVC(kernel='rbf', decision_function_shape='ovr', max_iter=2000)
classifier.fit(x_train_td, y_train)

# Mean accuracy on the held-out split.
score = classifier.score(x_test_td, y_test)
print("Accuracy:", score)

import umap

fit = umap.UMAP(n_neighbors=20,
                min_dist=0.1,
                n_components=3,
                metric='hellinger',
                )
rand_arr = np.arange(X_train.shape[0])
X_train_ump=X_train[rand_arr[0:3000]]
%time embedding = fit.fit_transform(X_train_ump)

In [None]:
%%time
svd = TruncatedSVD(n_components=200, n_iter=1000)
data_svd = svd.fit_transform(data_to_svd)
print (type(data_svd))
print(data_svd.shape)

#print(data_svd1.shape)

In [None]:
%%time
tsne = TSNE(
    n_components=3,
    n_iter=1000,
    learning_rate=200,
    perplexity=50,
    random_state=1132,
    verbose=1
)

embedding = tsne.fit_transform(data_to_svd)

embedding_df = pd.DataFrame(embedding, columns=["x", "y", "z"])
print(embedding_df.shape)

In [None]:
import os
# NOTE: the working directory is queried but the result is neither stored nor
# displayed (it is not the cell's last expression).
os.getcwd()
# Persist the 3-D embedding next to the notebook (relative path, pandas defaults).
embedding_df.to_csv('embedding_df.csv')

In [None]:
# Peek at the first 100 encoded training labels.
print(y_train[0:100])

In [None]:
import plotly.graph_objects as go

# The embedding only covers the subsample taken from the start of the training
# split, while `x_train` / `y_train` hold the whole split. Trim both to the
# number of embedded points so hover text and colours line up one-to-one with
# the markers. Passing the full arrays would attach every training tweet to
# the figure.
n_points = len(embedding_df)
point_text = x_train[:n_points]
point_labels = y_train[:n_points]

# Interactive 3-D scatter: one marker per embedded tweet, coloured by its
# encoded label, with the cleaned tweet text shown on hover.
scatter = go.Scatter3d(
    name=str(embedding_df.index),
    x=embedding_df["x"],
    y=embedding_df["y"],
    z=embedding_df["z"],
    text=point_text,
    hoverinfo='text',
    textposition="middle center",
    showlegend=False,
    mode="markers",
    ids=point_labels,
    marker=dict(size=3, color=point_labels, symbol="circle"),
)
figure = go.Figure(data=[scatter])
figure.show()