In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk, string
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline
import json
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Load the course table and drop every row whose name mentions "Thesis Research".
df = pd.read_csv('data.csv')
is_thesis = df['name'].str.contains("Thesis Research", na=False)  # missing names count as non-matching
df = df[~is_thesis]
# df = df[~df['name'].str.contains("LANG ", na=False)] # Exclude LANG courses
df.head(n=2)

Unnamed: 0.1,Unnamed: 0,dept,description,name
0,0,ACCT,Overview of accounting in business and social ...,"ACCT 1010 - Accounting, Business and Society (..."
1,1,ACCT,"For SB&amp;M students, and programs that desig...",ACCT 2010 - Principles of Accounting I (3 units)


In [None]:
# Course descriptions as a NumPy array, one entry per row of `df`.
# `Series.as_matrix()` was removed in pandas 1.0; `to_numpy()` is its replacement.
documents = df['description'].to_numpy()
# Single Porter stemmer instance shared by stem_tokens().
stemmer = PorterStemmer()
# Translation table that deletes every ASCII punctuation character
# (maps each code point in string.punctuation to None).
remove_punctuation_map = str.maketrans('', '', string.punctuation)

def stem_tokens(tokens):
    """Return a list holding the stem of every item in `tokens`, in order.

    Stemming is delegated to the module-level `stemmer` object.
    """
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed

def normalize(text):
    """Turn raw `text` into a list of stemmed tokens.

    Steps: lower-case the text, delete punctuation via
    `remove_punctuation_map`, split into words with `nltk.word_tokenize`,
    then stem each word with `stem_tokens`.
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(remove_punctuation_map)
    words = nltk.word_tokenize(without_punctuation)
    return stem_tokens(words)


# Characters treated as word separators: ASCII punctuation, newline and tab.
_SEPARATORS = string.punctuation + '\n\t'
# Translation table that turns each separator character into a single space.
_SEPARATORS_TO_SPACE = str.maketrans(_SEPARATORS, ' ' * len(_SEPARATORS))


def tokenize(text):
    """Lower-case, de-punctuate, tokenize, drop English stop words and stem.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Porter stems of the non-stop-word tokens, in document order.
    """
    # One translate() pass replaces the per-character replace() loop. Runs of
    # spaces need no collapsing because word_tokenize splits on whitespace.
    cleaned = text.lower().translate(_SEPARATORS_TO_SPACE)

    # Build the stop-word set and the stemmer once per call rather than once
    # per token; set membership is also O(1) instead of a list scan.
    english_stopwords = set(stopwords.words('english'))
    porter = PorterStemmer()

    # Filter first, then stem the *filtered* words so stop words are
    # actually excluded from the result.
    return [
        porter.stem(word)
        for word in nltk.word_tokenize(cleaned)
        if word not in english_stopwords
    ]


# Fit a TF-IDF model on the descriptions, using normalize() as the tokenizer
# and scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')
tfidf = vectorizer.fit_transform(documents)
print('DONE')

DONE


In [None]:
from sklearn.decomposition import TruncatedSVD  # not used in this cell

# Densify the TF-IDF matrix before handing it to t-SNE. The guard makes the
# cell safe to re-run: once `tfidf` is already an ndarray it has no
# `toarray` method and the conversion is skipped.
if hasattr(tfidf, 'toarray'):
    tfidf = tfidf.toarray()

# Fixed seed so the layout written to data.json is reproducible across runs.
TSNE_SEED = 0

# 2-D t-SNE embedding, one row per document.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# switch the keyword when upgrading — confirm against the installed version.
model = TSNE(
    n_components=2,
    perplexity=200,
    verbose=2,
    n_iter=2000,
    n_iter_without_progress=300,
    random_state=TSNE_SEED,
).fit_transform(tfidf)


def _min_max_scale(values):
    """Rescale a 1-D array linearly to [0, 1]; a constant array maps to zeros."""
    lowest = np.min(values)
    span = np.max(values) - lowest
    if span == 0:
        # Avoid 0/0 -> NaN, which json.dump would emit as invalid JSON.
        return np.zeros_like(values, dtype=float)
    return (values - lowest) / span


# save to json file
x_axis = model[:, 0]
y_axis = model[:, 1]
x_norm = _min_max_scale(x_axis)
y_norm = _min_max_scale(y_axis)
data = {"x": x_norm.tolist(), "y": y_norm.tolist()}
with open('data.json', 'w') as outfile:
    json.dump(data, outfile)

# Write the name and dept columns to labels.csv, then show course counts per department.
df.to_csv('labels.csv', header=True, encoding='utf-8', columns=['name','dept'])
df['dept'].value_counts()

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 601 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 1007
[t-SNE] Computed conditional probabilities for sample 1007 / 1007
[t-SNE] Mean sigma: 0.367226
[t-SNE] Iteration 25: error = 0.8121522, gradient norm = 0.0027706
[t-SNE] Iteration 50: error = 0.7590531, gradient norm = 0.0031140
[t-SNE] Iteration 75: error = 0.5950770, gradient norm = 0.0016228
[t-SNE] Iteration 100: error = 0.5657709, gradient norm = 0.0023671
[t-SNE] KL divergence after 100 iterations with early exaggeration: 0.565771
[t-SNE] Iteration 125: error = 0.5149027, gradient norm = 0.0015342
[t-SNE] Iteration 150: error = 0.5011275, gradient norm = 0.0021571
