In [32]:
import re
import pandas as pd
import os
import nltk
from sklearn.feature_extraction.text import  TfidfVectorizer
import pickle
import numpy as np
import gensim


In [87]:
# Remove numbers, special characters, stop words, and words of one or two characters
def number_special_stop(text,stop):
    """Lower-case ``text`` and strip digits, punctuation, short words and stop words.

    Parameters
    ----------
    text : object
        Value to clean. It is coerced with ``str()`` first, so non-string
        inputs are accepted.
    stop : collection of str
        Words to discard from the result.

    Returns
    -------
    str
        The surviving words (longer than two characters and not in ``stop``)
        joined by single spaces.
    """
    # A list/tuple of stop words is converted once to a frozenset so every
    # membership test below is O(1) instead of a linear scan per word.
    if isinstance(stop, (list, tuple)):
        stop = frozenset(stop)
    # Raw string: '\d' inside a normal literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    # Every digit or non-word character becomes a space; underscores count as
    # word characters and are therefore kept.
    cleaned = re.sub(r'[\d\W]', ' ', str(text).lower())
    kept = [word for word in cleaned.split(' ') if len(word) > 2 and word not in stop]
    return ' '.join(kept)

# Find the stemmed form, the lemmatized form, and the POS tag of each word
def stem_lemma_posTag(words):
    """Stem, lemmatize and POS-tag the unique words of ``words``.

    Parameters
    ----------
    words : list of str or str
        Either a list of tokens, or a text line that is split on single
        spaces.

    Returns
    -------
    tuple
        ``(stem_text, lemma_text, pos_tags)`` where ``stem_text`` is the
        space-joined Porter stems, ``lemma_text`` the space-joined WordNet
        lemmas, and ``pos_tags`` the result of ``nltk.pos_tag`` on the unique
        words.

    Notes
    -----
    Missing NLTK resources ('wordnet', 'averaged_perceptron_tagger') are
    downloaded on the first ``LookupError`` and the step is retried once.
    """
    stemmer=nltk.stem.PorterStemmer()
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # Split the *argument* when a text line is given. The previous code split
    # a global named ``text`` here, which only worked when such a global
    # happened to exist and to hold the same value.
    if not isinstance(words, list):
        words=words.split(' ')
    # De-duplicate while keeping first-occurrence order, so the joined
    # outputs are reproducible between runs (set() ordering is not).
    words=list(dict.fromkeys(words))

    # stemming
    stem_text=' '.join(stemmer.stem(word.strip()) for word in words)

    # lemmatization
    def _lemmatize_all():
        # Lemmatize every unique word, ignoring surrounding whitespace.
        return [lemmatizer.lemmatize(word.strip()) for word in words]

    try:
        lemma_words=_lemmatize_all()
    except LookupError:
        nltk.download('wordnet')   # download if not available
        lemma_words=_lemmatize_all()
    lemma_text=' '.join(lemma_words)

    # pos_tag
    try:
        pos_tags=nltk.pos_tag(words)
    except LookupError:
        nltk.download('averaged_perceptron_tagger')
        pos_tags=nltk.pos_tag(words)
    return stem_text,lemma_text,pos_tags

# Count the frequency of each unique word
def frequency_word(words):
    """Count how often each distinct word occurs.

    Parameters
    ----------
    words : list of str or str
        A list of tokens, or a text line that is split on single spaces.

    Returns
    -------
    dict
        Maps each distinct word to its number of occurrences.
    """
    # Imported locally so the function does not depend on a notebook-level
    # import cell having been updated.
    from collections import Counter

    if not isinstance(words, list):
        words=words.split(' ')
    # One pass over the words instead of one list.count() scan per unique word.
    return dict(Counter(words))


In [34]:
# Directory holding the sample documents.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
path='/home/shiva/Desktop/sample1/'
files=os.listdir(path)
# NOTE(review): os.listdir returns entries in arbitrary order, so files[2] is
# not guaranteed to be the same document on every machine.
# The context manager closes the file handle as soon as the text is read.
with open(os.path.join(path,files[2]),'r',encoding='utf-8') as sample_file:
    f=sample_file.read()

In [35]:
# Extra stop words added on top of NLTK's English list.
ex_stopwords = ['cir','nos','vii','llc']
try:
    stop = nltk.corpus.stopwords.words('english')
except LookupError:
    # The stop-word corpus is not installed yet: fetch it once and retry.
    nltk.download('stopwords')
    stop = nltk.corpus.stopwords.words('english')
# Append all extra stop words in a single call.
stop.extend(ex_stopwords)

In [36]:
# Clean the raw document, then derive stems, lemmas and POS tags from the
# cleaned text.
text = number_special_stop(text=f, stop=stop)
stem, lemma, pos = stem_lemma_posTag(words=text)
# NOTE(review): `lemma` is built from de-duplicated words, so these counts
# describe the unique lemmas rather than term frequency in the document —
# confirm this is intended.
freq = frequency_word(words=lemma)

In [165]:
class tfidf:
    """
        Wrapper around a TfidfVectorizer configured for uni- and bi-grams
        with L1 normalisation.
    """
    def __init__(self,max_feature=144):
        """Create the vectorizer; ``max_feature`` is passed on as ``max_features``."""
        vectorizer_options = {
            'ngram_range': (1, 2),
            'norm': 'l1',
            'max_features': max_feature,
        }
        self.tfid = TfidfVectorizer(**vectorizer_options)

    def vectorize(self,data):
        """Fit the vectorizer on ``data`` and return the result of ``toarray()``.

        The result is also stored on ``self.data``. If fitting or conversion
        raises ``ValueError``, a usage-hint string is returned instead of an
        array.
        """
        try:
            self.data = self.tfid.fit_transform(data)
            dense = self.data.toarray()
        except ValueError:
            return "plese use as: ['texts'] for single sample"
        self.data = dense
        return dense
    
class word_vector:
    """
        Turn lists of words into fixed-length numeric vectors using a
        word-embedding model.

        model_name: selects the model backing the instance
        1 model_name='glove'            -> unpickles 'glove_model.pkl' from the working directory
        2 model_name='word2vec_trained' -> loads the GoogleNews word2vec binary from its URL
        3 anything else (default None)  -> creates an own Word2Vec model; call train() before vectorize()

        sentences: optional 2D list of words (Ex:[['hi'],['bye','wow']]) used to
        initialise the own model in case 3. When omitted, the model is created
        empty and train() builds its vocabulary from the training corpus.
    """
    def __init__(self,model_name=None,sentences=None):  # initialize glove and word2vec model
        self.model_name=model_name
        # True only while an own model exists without any vocabulary yet.
        self._needs_vocab=False
        if self.model_name=='glove':
            # SECURITY: pickle.load can execute arbitrary code contained in the
            # file; only load a glove_model.pkl from a trusted source.
            with open('glove_model.pkl','rb') as model_file:  # closes the handle after loading
                self.model=pickle.load(model_file) # glove pickle model
            print(' glove model loaded .. ')
        elif self.model_name=='word2vec_trained':
            self.model=gensim.models.KeyedVectors.load_word2vec_format('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz', binary=True)
            print(' word2vec_trained model loaded .. ')
        else:
            if sentences is None:
                # Previously this branch read an undefined global `sentences`
                # and failed with NameError on a fresh kernel. Create an empty
                # model instead; train() builds the vocabulary later.
                self.model=gensim.models.Word2Vec(window=3,workers=4, min_count=2)
                self._needs_vocab=True
            else:
                self.model=gensim.models.Word2Vec(sentences,window=3,workers=4, min_count=2)
            print('you need to train model first on own corpus !!')

    def vectorize(self,words_list,size=144):
        """
          words_list: 2D list of words
          Ex:[['hi'],['bye','wow']]
          size: length of every returned row (default 144); shorter rows are
          zero-padded and longer rows are truncated.

          Each known word contributes one value: the mean of its model vector.
          Words missing from the model are skipped, printed, counted in
          self.count and collected in self.special_words.

          returns: array with one row of length `size` per inner list
        """
        self.vectors=[]    # vectorized rows, one per inner list
        self.count=0       # number of words not found in the model
        self.special_words=[]  # words that do not exist in the model
        for words in words_list:
            vec=[]
            for word in words:
                try:
                    vec.append(self.model.get_vector(word).mean()) # 1 value per word
                except KeyError:
                    print('{} : Not found word : {} '.format(self.count,word))
                    self.count+=1
                    self.special_words.append(word)
            vec=np.array(vec)
            vec.resize(size)   # in-place: zero-pad or truncate to `size`
            self.vectors.append(vec)
        return np.array(self.vectors)

    def train(self,corpus,epochs=10):
        """
          corpus: 2D list of words
          Ex:[['hi'],['bye','wow']]
          A plain string is treated as ONE sentence and split on single spaces.
          epochs: number of training epochs passed to the model

          After successful training self.model is replaced by the model's
          word vectors (self.model.wv). Any exception raised while training is
          printed rather than propagated.

          returns: self.model.vocab
        """
        if isinstance(corpus, str):
            # Wrap the split words in a list so the corpus stays 2D; a flat
            # list of words would make every word a "sentence" of characters.
            corpus=[corpus.split(' ')]
        try:
            if getattr(self,'_needs_vocab',False):
                # The model was created without sentences: build the
                # vocabulary from this corpus before training.
                self.model.build_vocab(corpus)
                self._needs_vocab=False
            self.model.train(corpus, total_examples=len(corpus), epochs=epochs)
            self.model=self.model.wv
            print('training completed...')
        except Exception as e:
            print(e)
        return self.model.vocab
        
                       

In [85]:
import pandas as pd

# Read the sample CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='sample1.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,page,summary
0,0,Page 514 506 Fed.Appx. 514 (,"2013) DAVID A. NOWICKI and BARBARA C. TREMEL,..."
1,1,Page 520 506 Fed.Appx. 520 (,"2013) UNITED STATES OF AMERICA, Plaintiff-App..."
2,2,Page 523 506 Fed.Appx. 523 (,"2013) UNITED STATES OF AMERICA, Plaintiff-App..."
3,3,Page 525 506 Fed.Appx. 525 (,"2013) UNITED STATES OF AMERICA, Plaintiff-App..."
4,4,Page 606 507 Fed.Appx. 606 (,"2013) UNITED STATES OF AMERICA, Plaintiff-App..."


In [126]:
# df.summary=df.summary.apply(lambda x:number_special_stop(x,stop))
# from gensim.models import Word2Vec
# sentences=[line.split(' ') for line in df.summary[:10]]
# text=sentences
# word2vec_model = Word2Vec(text,window=3,workers=4, min_count=2)
# vocab=word2vec_model.train(text,total_examples=len(text),epochs=10)
# vocab=word2vec_model.wv.get_vector('ok')
# vocab